Adjust clang format to the new versions, revert broken macro layout (#714)
このコミットが含まれているのは:
@@ -1,10 +1,10 @@
|
||||
Language: Cpp
|
||||
BasedOnStyle: Google
|
||||
AlignEscapedNewlinesLeft: false
|
||||
AlignOperands: false
|
||||
AlignOperands: Align
|
||||
ColumnLimit: 100
|
||||
AlwaysBreakTemplateDeclarations: false
|
||||
BreakTemplateDeclarations: No
|
||||
DerivePointerAlignment: false
|
||||
IndentFunctionDeclarationAfterType: false
|
||||
MaxEmptyLinesToKeep: 2
|
||||
SortIncludes: false
|
||||
SortIncludes: Never
|
||||
|
||||
@@ -1915,13 +1915,12 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 unsafeAtomicAdd(__hip_bfloat16* address,
|
||||
static_assert(sizeof(unsigned short int) == sizeof(__hip_bfloat16_raw));
|
||||
unsigned short int* address_as_short = reinterpret_cast<unsigned short int*>(address);
|
||||
// Align to 4 bytes
|
||||
unsigned int* aligned_addr =
|
||||
__builtin_bit_cast(unsigned int*,
|
||||
__builtin_bit_cast(unsigned long long int, address_as_short) &
|
||||
(unsigned long long int)(~0x3));
|
||||
unsigned int* aligned_addr = __builtin_bit_cast(
|
||||
unsigned int*, __builtin_bit_cast(unsigned long long int, address_as_short) &
|
||||
(unsigned long long int)(~0x3));
|
||||
|
||||
bool is_lower = __builtin_bit_cast(unsigned long long int, aligned_addr) ==
|
||||
__builtin_bit_cast(unsigned long long int, address);
|
||||
__builtin_bit_cast(unsigned long long int, address);
|
||||
|
||||
__hip_bfloat162 fval;
|
||||
if (is_lower)
|
||||
|
||||
@@ -375,8 +375,7 @@ class coalesced_group : public thread_group {
|
||||
friend __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
|
||||
unsigned int tile_size);
|
||||
friend __CG_QUALIFIER__ coalesced_group binary_partition(const coalesced_group& cgrp, bool pred);
|
||||
template <unsigned int fsize, class fparent>
|
||||
friend __CG_QUALIFIER__ coalesced_group
|
||||
template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
|
||||
binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);
|
||||
|
||||
__CG_QUALIFIER__ coalesced_group new_tiled_group(unsigned int tile_size) const {
|
||||
@@ -393,8 +392,8 @@ class coalesced_group : public thread_group {
|
||||
unsigned int masklength =
|
||||
min(static_cast<unsigned int>(num_threads()) - base_offset, tile_size);
|
||||
lane_mask full_mask = (static_cast<int>(warpSize) == 32)
|
||||
? static_cast<lane_mask>((1u << 32) - 1)
|
||||
: static_cast<lane_mask>(-1ull);
|
||||
? static_cast<lane_mask>((1u << 32) - 1)
|
||||
: static_cast<lane_mask>(-1ull);
|
||||
lane_mask member_mask = full_mask >> (warpSize - masklength);
|
||||
|
||||
member_mask <<= (__lane_id() & ~(tile_size - 1));
|
||||
@@ -485,9 +484,9 @@ class coalesced_group : public thread_group {
|
||||
srcRank = srcRank % static_cast<int>(num_threads());
|
||||
|
||||
int lane = (num_threads() == warpSize) ? srcRank
|
||||
: (static_cast<int>(warpSize) == 64)
|
||||
? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
|
||||
: __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
|
||||
: (static_cast<int>(warpSize) == 64)
|
||||
? __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
|
||||
: __fns32(coalesced_info.member_mask, 0, (srcRank + 1));
|
||||
|
||||
return __shfl(var, lane, warpSize);
|
||||
}
|
||||
@@ -835,8 +834,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
|
||||
"Tile size is either not a power of 2 or greater than the wavefront size");
|
||||
using tile_base<size>::numThreads;
|
||||
|
||||
template <unsigned int fsize, class fparent>
|
||||
friend __CG_QUALIFIER__ coalesced_group
|
||||
template <unsigned int fsize, class fparent> friend __CG_QUALIFIER__ coalesced_group
|
||||
binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);
|
||||
|
||||
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
|
||||
@@ -910,10 +908,10 @@ template <unsigned int tileSize, typename ParentCGTy> class parent_group_info {
|
||||
* \note This type is implemented on Linux, under development
|
||||
* on Microsoft Windows.
|
||||
*/
|
||||
template <unsigned int tileSize, class ParentCGTy>
|
||||
class thread_block_tile_type : public thread_block_tile_base<tileSize>,
|
||||
public tiled_group,
|
||||
public parent_group_info<tileSize, ParentCGTy> {
|
||||
template <unsigned int tileSize, class ParentCGTy> class thread_block_tile_type
|
||||
: public thread_block_tile_base<tileSize>,
|
||||
public tiled_group,
|
||||
public parent_group_info<tileSize, ParentCGTy> {
|
||||
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
|
||||
typedef thread_block_tile_base<numThreads> tbtBase;
|
||||
|
||||
@@ -931,9 +929,8 @@ class thread_block_tile_type : public thread_block_tile_base<tileSize>,
|
||||
};
|
||||
|
||||
// Partial template specialization
|
||||
template <unsigned int tileSize>
|
||||
class thread_block_tile_type<tileSize, void> : public thread_block_tile_base<tileSize>,
|
||||
public tiled_group {
|
||||
template <unsigned int tileSize> class thread_block_tile_type<tileSize, void>
|
||||
: public thread_block_tile_base<tileSize>, public tiled_group {
|
||||
_CG_STATIC_CONST_DECL_ unsigned int numThreads = tileSize;
|
||||
|
||||
typedef thread_block_tile_base<numThreads> tbtBase;
|
||||
@@ -1013,11 +1010,10 @@ __CG_QUALIFIER__ coalesced_group tiled_partition(const coalesced_group& parent,
|
||||
namespace impl {
|
||||
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal;
|
||||
|
||||
template <unsigned int size, class ParentCGTy>
|
||||
class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGTy> {
|
||||
template <unsigned int size, class ParentCGTy> class thread_block_tile_internal
|
||||
: public thread_block_tile_type<size, ParentCGTy> {
|
||||
protected:
|
||||
template <unsigned int tbtSize, class tbtParentT>
|
||||
__CG_QUALIFIER__ thread_block_tile_internal(
|
||||
template <unsigned int tbtSize, class tbtParentT> __CG_QUALIFIER__ thread_block_tile_internal(
|
||||
const thread_block_tile_internal<tbtSize, tbtParentT>& g)
|
||||
: thread_block_tile_type<size, ParentCGTy>(g.meta_group_rank(), g.meta_group_size()) {}
|
||||
|
||||
@@ -1034,8 +1030,8 @@ class thread_block_tile_internal : public thread_block_tile_type<size, ParentCGT
|
||||
* \note This type is implemented on Linux, under development
|
||||
* on Microsoft Windows.
|
||||
*/
|
||||
template <unsigned int size, class ParentCGTy>
|
||||
class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCGTy> {
|
||||
template <unsigned int size, class ParentCGTy> class thread_block_tile
|
||||
: public impl::thread_block_tile_internal<size, ParentCGTy> {
|
||||
protected:
|
||||
__CG_QUALIFIER__ thread_block_tile(const ParentCGTy& g)
|
||||
: impl::thread_block_tile_internal<size, ParentCGTy>(g) {}
|
||||
@@ -1171,8 +1167,8 @@ class thread_block_tile : public impl::thread_block_tile_internal<size, ParentCG
|
||||
#endif
|
||||
};
|
||||
|
||||
template <unsigned int size>
|
||||
class thread_block_tile<size, void> : public impl::thread_block_tile_internal<size, void> {
|
||||
template <unsigned int size> class thread_block_tile<size, void>
|
||||
: public impl::thread_block_tile_internal<size, void> {
|
||||
template <unsigned int, class ParentCGTy> friend class thread_block_tile;
|
||||
|
||||
protected:
|
||||
@@ -1187,8 +1183,8 @@ template <unsigned int size, class ParentCGTy = void> class thread_block_tile;
|
||||
namespace impl {
|
||||
template <unsigned int size, class ParentCGTy> struct tiled_partition_internal;
|
||||
|
||||
template <unsigned int size>
|
||||
struct tiled_partition_internal<size, thread_block> : public thread_block_tile<size, thread_block> {
|
||||
template <unsigned int size> struct tiled_partition_internal<size, thread_block>
|
||||
: public thread_block_tile<size, thread_block> {
|
||||
__CG_QUALIFIER__ tiled_partition_internal(const thread_block& g)
|
||||
: thread_block_tile<size, thread_block>(g) {}
|
||||
};
|
||||
|
||||
@@ -82,8 +82,8 @@ namespace __hip_internal {
|
||||
template <> struct is_floating_point<_Float16> : __hip_internal::true_type {};
|
||||
} // namespace __hip_internal
|
||||
|
||||
template <bool cond, typename T = void>
|
||||
using Enable_if_t = typename __hip_internal::enable_if<cond, T>::type;
|
||||
template <bool cond, typename T = void> using Enable_if_t =
|
||||
typename __hip_internal::enable_if<cond, T>::type;
|
||||
|
||||
// BEGIN STRUCT __HALF
|
||||
struct __half {
|
||||
@@ -649,7 +649,7 @@ inline __HOST_DEVICE__ bool __hgt(__half x, __half y) {
|
||||
}
|
||||
inline __HOST_DEVICE__ bool __hequ(__half x, __half y) {
|
||||
return !(static_cast<__half_raw>(x).data < static_cast<__half_raw>(y).data) &&
|
||||
!(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
|
||||
!(static_cast<__half_raw>(x).data > static_cast<__half_raw>(y).data);
|
||||
}
|
||||
inline __HOST_DEVICE__ bool __hneu(__half x, __half y) {
|
||||
return !(static_cast<__half_raw>(x).data == static_cast<__half_raw>(y).data);
|
||||
@@ -693,7 +693,7 @@ inline __HOST_DEVICE__ __half2 __hgt2(__half2 x, __half2 y) {
|
||||
}
|
||||
inline __HOST_DEVICE__ __half2 __hequ2(__half2 x, __half2 y) {
|
||||
auto r = !(static_cast<__half2_raw>(x).data < static_cast<__half2_raw>(y).data) &&
|
||||
!(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
|
||||
!(static_cast<__half2_raw>(x).data > static_cast<__half2_raw>(y).data);
|
||||
return __builtin_convertvector(-r, _Float16_2);
|
||||
}
|
||||
inline __HOST_DEVICE__ __half2 __hneu2(__half2 x, __half2 y) {
|
||||
@@ -911,13 +911,12 @@ inline __device__ __half unsafeAtomicAdd(__half* address, __half value) {
|
||||
static_assert(sizeof(unsigned short int) == sizeof(__half_raw));
|
||||
unsigned short int* address_as_short = reinterpret_cast<unsigned short int*>(address);
|
||||
// Align to 4 bytes
|
||||
unsigned int* aligned_addr =
|
||||
__builtin_bit_cast(unsigned int*,
|
||||
__builtin_bit_cast(unsigned long long int, address_as_short) &
|
||||
(unsigned long long int)(~0x3));
|
||||
unsigned int* aligned_addr = __builtin_bit_cast(
|
||||
unsigned int*, __builtin_bit_cast(unsigned long long int, address_as_short) &
|
||||
(unsigned long long int)(~0x3));
|
||||
|
||||
bool is_lower = __builtin_bit_cast(unsigned long long int, aligned_addr) ==
|
||||
__builtin_bit_cast(unsigned long long int, address);
|
||||
__builtin_bit_cast(unsigned long long int, address);
|
||||
__half2 fval;
|
||||
if (is_lower)
|
||||
fval = __halves2half2(value, __float2half(0.0f));
|
||||
|
||||
@@ -327,8 +327,8 @@ where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8
|
||||
this case, the fp16 mantissa should be shifted left by 1 */
|
||||
act_exponent = exponent - bias + 1;
|
||||
exponent_diff = f8_denormal_act_exponent -
|
||||
act_exponent; // actual exponent is exponent-bias+1 as it is denormal
|
||||
} else { // fp32/fp16 is normal with implicit 1
|
||||
act_exponent; // actual exponent is exponent-bias+1 as it is denormal
|
||||
} else { // fp32/fp16 is normal with implicit 1
|
||||
act_exponent = exponent - bias;
|
||||
if (act_exponent <= f8_denormal_act_exponent) {
|
||||
/* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
|
||||
@@ -345,7 +345,7 @@ So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
|
||||
}
|
||||
|
||||
bool midpoint = (mantissa & ((1ull << (mfmt - wm + exponent_diff)) - 1)) ==
|
||||
(1ull << (mfmt - wm + exponent_diff - 1));
|
||||
(1ull << (mfmt - wm + exponent_diff - 1));
|
||||
/* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we shift
|
||||
right, as shifting right could strip off some residual bits and make something that is not a midpoint look like
|
||||
midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010) is larger than the midpoint, but
|
||||
@@ -400,9 +400,9 @@ after shift right by 4 bits, it would look like midpoint.
|
||||
// The conversion function is from rocblas
|
||||
// https://github.com/ROCm/rocBLAS/blob/9b7f692abe3c54b88d1e77e045a7db7f1f188b69/library/include/internal/rocblas_hip_f8_impl.h#L220
|
||||
// This has been modified to handle double types as well
|
||||
template <typename T, bool is_fnuz>
|
||||
__FP8_HOST_DEVICE_STATIC__ T cast_from_f8(__hip_fp8_storage_t x, int wm, int we,
|
||||
bool clip = false) {
|
||||
template <typename T, bool is_fnuz> __FP8_HOST_DEVICE_STATIC__ T cast_from_f8(__hip_fp8_storage_t x,
|
||||
int wm, int we,
|
||||
bool clip = false) {
|
||||
#if defined(__clang__) and defined(__HIP__)
|
||||
constexpr bool is_half = __hip_internal::is_same<T, _Float16>::value;
|
||||
constexpr bool is_float = __hip_internal::is_same<T, float>::value;
|
||||
@@ -576,14 +576,15 @@ static __device__ __hip_fp8_storage_t cast_to_f8_from_f32(float v, bool saturate
|
||||
|
||||
if (stochastic_rounding) {
|
||||
ival = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
|
||||
? __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0)
|
||||
: __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
|
||||
? __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0)
|
||||
: __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
|
||||
val.i32val = ival;
|
||||
i8data = val.i8val[0]; // little endian
|
||||
} else { // RNE CVT
|
||||
ival = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
|
||||
? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false)
|
||||
: __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0
|
||||
ival =
|
||||
(interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
|
||||
? __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false)
|
||||
: __builtin_amdgcn_cvt_pk_bf8_f32(val.fval, val.fval, ival, false); // false -> WORD0
|
||||
val.i32val = ival;
|
||||
i8data = val.i8val[0];
|
||||
}
|
||||
@@ -628,8 +629,8 @@ cast_to_f8x2_from_f32x2(float2 v, bool saturate, __hip_fp8_interpretation_t inte
|
||||
}
|
||||
|
||||
f2val.i32val[0] = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
|
||||
? __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, 0, false)
|
||||
: __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, 0, false);
|
||||
? __builtin_amdgcn_cvt_pk_fp8_f32(v.x, v.y, 0, false)
|
||||
: __builtin_amdgcn_cvt_pk_bf8_f32(v.x, v.y, 0, false);
|
||||
|
||||
return static_cast<__hip_fp8x2_storage_t>(f2val.i16val[0]);
|
||||
}
|
||||
@@ -643,8 +644,8 @@ static __device__ float cast_to_f32_from_f8(__hip_fp8_storage_t v,
|
||||
val.i8val[0] = v;
|
||||
|
||||
float fval = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
|
||||
? __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0)
|
||||
: __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0);
|
||||
? __builtin_amdgcn_cvt_f32_fp8(val.i32val, 0)
|
||||
: __builtin_amdgcn_cvt_f32_bf8(val.i32val, 0);
|
||||
return fval;
|
||||
}
|
||||
|
||||
@@ -657,8 +658,8 @@ static __device__ float2 cast_to_f32x2_from_f8x2(__hip_fp8x2_storage_t v,
|
||||
val.i16val[0] = v;
|
||||
|
||||
auto f2 = (interpret == __HIP_E4M3_FNUZ) || (interpret == __HIP_E4M3)
|
||||
? __builtin_amdgcn_cvt_pk_f32_fp8(val.i32val, false)
|
||||
: __builtin_amdgcn_cvt_pk_f32_bf8(val.i32val, false);
|
||||
? __builtin_amdgcn_cvt_pk_f32_fp8(val.i32val, false)
|
||||
: __builtin_amdgcn_cvt_pk_f32_bf8(val.i32val, false);
|
||||
return float2{f2[0], f2[1]};
|
||||
}
|
||||
#endif // HIP_FP8_CVT_FAST_PATH
|
||||
@@ -672,9 +673,9 @@ __FP8_HOST_DEVICE_STATIC__ bool hip_fp8_fnuz_is_nan(__hip_fp8_storage_t a) {
|
||||
|
||||
__FP8_HOST_DEVICE_STATIC__ bool hip_fp8_ocp_is_nan(__hip_fp8_storage_t a,
|
||||
const __hip_fp8_interpretation_t type) {
|
||||
return (type == __HIP_E4M3) ? ((a & 0x7f) == 0x7f)
|
||||
: (type == __HIP_E5M2) ? ((a & 0x7f) > 0x7c)
|
||||
: false;
|
||||
return (type == __HIP_E4M3) ? ((a & 0x7f) == 0x7f)
|
||||
: (type == __HIP_E5M2) ? ((a & 0x7f) > 0x7c)
|
||||
: false;
|
||||
}
|
||||
|
||||
__FP8_HOST_DEVICE_STATIC__ bool hip_fp8_ocp_is_inf(__hip_fp8_storage_t a,
|
||||
|
||||
@@ -334,13 +334,13 @@ __OCP_FP_HOST_DEVICE_STATIC__ float __amd_cvt_fp8_to_float_scale(
|
||||
const __amd_scale_t scale) {
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
return interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_f32_fp8(val, __amd_scale_to_float(scale), 0)
|
||||
: __builtin_amdgcn_cvt_scalef32_f32_bf8(val, __amd_scale_to_float(scale), 0);
|
||||
? __builtin_amdgcn_cvt_scalef32_f32_fp8(val, __amd_scale_to_float(scale), 0)
|
||||
: __builtin_amdgcn_cvt_scalef32_f32_bf8(val, __amd_scale_to_float(scale), 0);
|
||||
#else
|
||||
using namespace fcbx;
|
||||
return interpret == __AMD_OCP_E4M3
|
||||
? to_float<float, Encoding::E4M3, true>(static_cast<uint32_t>(val), scale)
|
||||
: to_float<float, Encoding::E5M2, true>(static_cast<uint32_t>(val), scale);
|
||||
? to_float<float, Encoding::E4M3, true>(static_cast<uint32_t>(val), scale)
|
||||
: to_float<float, Encoding::E5M2, true>(static_cast<uint32_t>(val), scale);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -378,8 +378,8 @@ __amd_cvt_float_to_fp8_sr_scale(const float val, const __amd_fp8_interpretation_
|
||||
} u{0};
|
||||
using namespace fcbx;
|
||||
u.ui32t = interpret == __AMD_OCP_E4M3
|
||||
? from_float_sr<float, Encoding::E4M3, true>(val, seed, scale)
|
||||
: from_float_sr<float, Encoding::E5M2, true>(val, seed, scale);
|
||||
? from_float_sr<float, Encoding::E4M3, true>(val, seed, scale)
|
||||
: from_float_sr<float, Encoding::E5M2, true>(val, seed, scale);
|
||||
return u.fp8[0];
|
||||
#endif
|
||||
}
|
||||
@@ -548,8 +548,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_floatx2_storage_t __amd_cvt_fp8x2_to_floatx2
|
||||
const __amd_scale_t scale) {
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
return interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(val, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(val, __amd_scale_to_float(scale), false);
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_f32_fp8(val, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_f32_bf8(val, __amd_scale_to_float(scale), false);
|
||||
#else
|
||||
using namespace fcbx;
|
||||
__amd_floatx2_storage_t ret;
|
||||
@@ -582,10 +582,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_floatx2_to_fp8x2_s
|
||||
__amd_fp8x2_storage_t fp8x2[2];
|
||||
} u{0};
|
||||
u.shortx2 = interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(u.shortx2, val[0], val[1],
|
||||
__amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(u.shortx2, val[0], val[1],
|
||||
__amd_scale_to_float(scale), false);
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_fp8_f32(u.shortx2, val[0], val[1],
|
||||
__amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f32(u.shortx2, val[0], val[1],
|
||||
__amd_scale_to_float(scale), false);
|
||||
return u.fp8x2[0];
|
||||
#else
|
||||
using namespace fcbx;
|
||||
@@ -679,8 +679,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp16x2_storage_t __amd_cvt_fp8x2_to_fp16x2_s
|
||||
} u;
|
||||
u.fp8x2[0] = val;
|
||||
return interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(scale), false);
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(scale), false);
|
||||
#else
|
||||
using namespace fcbx;
|
||||
__amd_fp16x2_storage_t ret;
|
||||
@@ -787,8 +787,9 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_bf16x2_storage_t __amd_cvt_fp8x2_to_bf16x2_s
|
||||
} u;
|
||||
u.fp8x2[0] = in;
|
||||
return interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale), false);
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale),
|
||||
false);
|
||||
#else
|
||||
using namespace fcbx;
|
||||
__amd_bf16x2_storage_t ret;
|
||||
@@ -891,8 +892,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp16x32_storage_t __amd_cvt_fp6x32_to_fp16x3
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
// gfx950 expects scale to be in float
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(in, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(in, __amd_scale_to_float(scale));
|
||||
? __builtin_amdgcn_cvt_scalef32_pk32_f16_fp6(in, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_pk32_f16_bf6(in, __amd_scale_to_float(scale));
|
||||
#else
|
||||
using namespace fcbx;
|
||||
if (interpret == __AMD_OCP_E2M3) {
|
||||
@@ -918,8 +919,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_bf16x32_storage_t __amd_cvt_fp6x32_to_bf16x3
|
||||
const __amd_scale_t scale) {
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(in, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(in, __amd_scale_to_float(scale));
|
||||
? __builtin_amdgcn_cvt_scalef32_pk32_bf16_fp6(in, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_pk32_bf16_bf6(in, __amd_scale_to_float(scale));
|
||||
#else
|
||||
using namespace fcbx;
|
||||
if (interpret == __AMD_OCP_E2M3) {
|
||||
@@ -937,15 +938,15 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_floatx32_storage_t __amd_cvt_fp6x32_to_float
|
||||
const __amd_scale_t scale) {
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(val, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(val, __amd_scale_to_float(scale));
|
||||
? __builtin_amdgcn_cvt_scalef32_pk32_f32_fp6(val, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_pk32_f32_bf6(val, __amd_scale_to_float(scale));
|
||||
#else
|
||||
using namespace fcbx;
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float, Encoding::E2M3,
|
||||
Encoding::IEEE754>(val, scale)
|
||||
: fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float, Encoding::E3M2,
|
||||
Encoding::IEEE754>(val, scale);
|
||||
? fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float,
|
||||
Encoding::E2M3, Encoding::IEEE754>(val, scale)
|
||||
: fp6_cvt_packedx32<__amd_fp6x32_storage_t, __amd_floatx32_storage_t, float,
|
||||
Encoding::E3M2, Encoding::IEEE754>(val, scale);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1200,9 +1201,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_fp16x2_to_fp8x2_sc
|
||||
__amd_shortx2_storage_t shortx2;
|
||||
__amd_fp8x2_storage_t fp8x2[2];
|
||||
} u{0};
|
||||
u.shortx2 = interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(u.shortx2, in, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(u.shortx2, in, __amd_scale_to_float(scale), false);
|
||||
u.shortx2 = interpret == __AMD_OCP_E4M3 ? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(
|
||||
u.shortx2, in, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(
|
||||
u.shortx2, in, __amd_scale_to_float(scale), false);
|
||||
return u.fp8x2[0];
|
||||
#else
|
||||
static_assert(sizeof(__amd_fp8x2_storage_t[2]) == sizeof(uint32_t));
|
||||
@@ -1241,10 +1243,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_bf16x2_to_fp8x2_sc
|
||||
__amd_shortx2_storage_t shortx2;
|
||||
__amd_fp8x2_storage_t fp8x2[2];
|
||||
} u{0};
|
||||
u.shortx2 = interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(u.shortx2, in, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(u.shortx2, in, __amd_scale_to_float(scale),
|
||||
false);
|
||||
u.shortx2 = interpret == __AMD_OCP_E4M3 ? __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16(
|
||||
u.shortx2, in, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16(
|
||||
u.shortx2, in, __amd_scale_to_float(scale), false);
|
||||
return u.fp8x2[0];
|
||||
#else
|
||||
using namespace fcbx;
|
||||
@@ -1429,9 +1431,10 @@ __amd_cvt_fp8_to_fp16_scale(const __amd_fp8_storage_t val,
|
||||
const __amd_fp8_interpretation_t interpret, const __amd_scale_t scale) {
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
__amd_fp16x2_storage_t ret;
|
||||
ret = interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_f16_fp8(ret, val, __amd_scale_to_float(scale), 0, false)
|
||||
: __builtin_amdgcn_cvt_scalef32_f16_bf8(ret, val, __amd_scale_to_float(scale), 0, false);
|
||||
ret =
|
||||
interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_f16_fp8(ret, val, __amd_scale_to_float(scale), 0, false)
|
||||
: __builtin_amdgcn_cvt_scalef32_f16_bf8(ret, val, __amd_scale_to_float(scale), 0, false);
|
||||
return ret[0];
|
||||
#else
|
||||
using namespace fcbx;
|
||||
@@ -1463,9 +1466,10 @@ __amd_cvt_fp8_to_bf16_scale(const __amd_fp8_storage_t val,
|
||||
unsigned int ui32;
|
||||
} u{0};
|
||||
u.fp8[0] = val;
|
||||
auto ret = interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale), false);
|
||||
auto ret =
|
||||
interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_bf16_fp8(u.ui32, __amd_scale_to_float(scale), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf16_bf8(u.ui32, __amd_scale_to_float(scale), false);
|
||||
return ret[0];
|
||||
#else
|
||||
using namespace fcbx;
|
||||
@@ -1491,8 +1495,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx16_floatx16
|
||||
const __amd_fp6_interpretation_t interpret, const __amd_scale_t scale) {
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale));
|
||||
? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale));
|
||||
#else
|
||||
__amd_floatx32_storage_t tmp;
|
||||
for (size_t i = 0; i < 16; i++) {
|
||||
@@ -1503,10 +1507,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx16_floatx16
|
||||
}
|
||||
using namespace fcbx;
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E2M3>(tmp, scale)
|
||||
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E3M2>(tmp, scale);
|
||||
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E2M3>(tmp, scale)
|
||||
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E3M2>(tmp, scale);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1529,15 +1533,15 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx32_to_fp6x3
|
||||
in2 = {val[16], val[17], val[18], val[19], val[20], val[21], val[22], val[23],
|
||||
val[24], val[25], val[26], val[27], val[28], val[29], val[30], val[31]};
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale));
|
||||
? __builtin_amdgcn_cvt_scalef32_2xpk16_fp6_f32(in1, in2, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_2xpk16_bf6_f32(in1, in2, __amd_scale_to_float(scale));
|
||||
#else
|
||||
using namespace fcbx;
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E2M3>(val, scale)
|
||||
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E3M2>(val, scale);
|
||||
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E2M3>(val, scale)
|
||||
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E3M2>(val, scale);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1555,16 +1559,17 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_floatx32_to_fp6x3
|
||||
const unsigned int round, const __amd_scale_t scale) {
|
||||
#if __has_builtin(__builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32) and \
|
||||
__has_builtin(__builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32)
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(val, round, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(val, round, __amd_scale_to_float(scale));
|
||||
return interpret == __AMD_OCP_E2M3 ? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(
|
||||
val, round, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(
|
||||
val, round, __amd_scale_to_float(scale));
|
||||
#else
|
||||
using namespace fcbx;
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E2M3, true>(val, scale, round)
|
||||
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E3M2, true>(val, scale, round);
|
||||
? fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E2M3, true>(val, scale, round)
|
||||
: fp6_cvt_packedx32<__amd_floatx32_storage_t, __amd_fp6x32_storage_t, float,
|
||||
Encoding::IEEE754, Encoding::E3M2, true>(val, scale, round);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1638,16 +1643,17 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_fp16x32_to_fp6x32
|
||||
const unsigned int round, const __amd_scale_t scale) {
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(in, round, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(in, round, __amd_scale_to_float(scale));
|
||||
? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f16(in, round, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f16(in, round,
|
||||
__amd_scale_to_float(scale));
|
||||
#else
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t,
|
||||
__amd_fp16_storage_t, fcbx::Encoding::E5M10, fcbx::Encoding::E2M3,
|
||||
true>(in, scale, round)
|
||||
: fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t,
|
||||
__amd_fp16_storage_t, fcbx::Encoding::E5M10, fcbx::Encoding::E3M2,
|
||||
true>(in, scale, round);
|
||||
? fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t,
|
||||
__amd_fp16_storage_t, fcbx::Encoding::E5M10,
|
||||
fcbx::Encoding::E2M3, true>(in, scale, round)
|
||||
: fcbx::fp6_cvt_packedx32<__amd_fp16x32_storage_t, __amd_fp6x32_storage_t,
|
||||
__amd_fp16_storage_t, fcbx::Encoding::E5M10,
|
||||
fcbx::Encoding::E3M2, true>(in, scale, round);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1655,17 +1661,18 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp6x32_storage_t __amd_cvt_bf16x32_to_fp6x32
|
||||
const __amd_bf16x32_storage_t in, const __amd_fp6_interpretation_t interpret,
|
||||
const unsigned int round, const __amd_scale_t scale) {
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16(in, round, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16(in, round, __amd_scale_to_float(scale));
|
||||
return interpret == __AMD_OCP_E2M3 ? __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_bf16(
|
||||
in, round, __amd_scale_to_float(scale))
|
||||
: __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_bf16(
|
||||
in, round, __amd_scale_to_float(scale));
|
||||
#else
|
||||
return interpret == __AMD_OCP_E2M3
|
||||
? fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t,
|
||||
__amd_bf16_storage_t, fcbx::Encoding::E8M7, fcbx::Encoding::E2M3,
|
||||
true>(in, scale, round)
|
||||
: fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t,
|
||||
__amd_bf16_storage_t, fcbx::Encoding::E8M7, fcbx::Encoding::E3M2,
|
||||
true>(in, scale, round);
|
||||
? fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t,
|
||||
__amd_bf16_storage_t, fcbx::Encoding::E8M7,
|
||||
fcbx::Encoding::E2M3, true>(in, scale, round)
|
||||
: fcbx::fp6_cvt_packedx32<__amd_bf16x32_storage_t, __amd_fp6x32_storage_t,
|
||||
__amd_bf16_storage_t, fcbx::Encoding::E8M7,
|
||||
fcbx::Encoding::E3M2, true>(in, scale, round);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -2542,8 +2549,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp16x2_storage_t __amd_cvt_fp8x2_to_fp16x2(
|
||||
} u;
|
||||
u.fp8x2[0] = val;
|
||||
return interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(0), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(0), false);
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_f16_fp8(u.ui32, __amd_scale_to_float(0), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_f16_bf8(u.ui32, __amd_scale_to_float(0), false);
|
||||
#else
|
||||
using namespace fcbx;
|
||||
__amd_fp16x2_storage_t ret;
|
||||
@@ -2573,9 +2580,10 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8x2_storage_t __amd_cvt_fp16x2_to_fp8x2(
|
||||
__amd_shortx2_storage_t shortx2;
|
||||
__amd_fp8x2_storage_t fp8x2[2];
|
||||
} u{0};
|
||||
u.shortx2 = interpret == __AMD_OCP_E4M3
|
||||
? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(u.shortx2, val, __amd_scale_to_float(0), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(u.shortx2, val, __amd_scale_to_float(0), false);
|
||||
u.shortx2 = interpret == __AMD_OCP_E4M3 ? __builtin_amdgcn_cvt_scalef32_pk_fp8_f16(
|
||||
u.shortx2, val, __amd_scale_to_float(0), false)
|
||||
: __builtin_amdgcn_cvt_scalef32_pk_bf8_f16(
|
||||
u.shortx2, val, __amd_scale_to_float(0), false);
|
||||
return u.fp8x2[0];
|
||||
#else
|
||||
using namespace fcbx;
|
||||
@@ -2783,8 +2791,8 @@ __OCP_FP_HOST_DEVICE_STATIC__ __amd_fp8_storage_t __amd_cvt_fp16_to_fp8_sr(
|
||||
#else
|
||||
using namespace fcbx;
|
||||
return interpret == __AMD_OCP_E4M3
|
||||
? from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val, sr, 0)
|
||||
: from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val, sr, 0);
|
||||
? from_float_sr<__amd_fp16_storage_t, Encoding::E4M3, true>(val, sr, 0)
|
||||
: from_float_sr<__amd_fp16_storage_t, Encoding::E5M2, true>(val, sr, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -719,8 +719,8 @@ struct __hipext_ocp_fp6x32_e2m3 {
|
||||
}
|
||||
#endif
|
||||
|
||||
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e2m3(const __amd_fp16x32_storage_t in,
|
||||
const __amd_scale_t scale)
|
||||
__OCP_FP_HOST_DEVICE__
|
||||
__hipext_ocp_fp6x32_e2m3(const __amd_fp16x32_storage_t in, const __amd_scale_t scale)
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
: __x(__builtin_amdgcn_cvt_scalef32_pk32_fp6_f16(in, __amd_scale_to_float(scale))){}
|
||||
#else
|
||||
@@ -742,8 +742,8 @@ struct __hipext_ocp_fp6x32_e2m3 {
|
||||
}
|
||||
#endif
|
||||
|
||||
__OCP_FP_HOST_DEVICE__
|
||||
__hipext_ocp_fp6x32_e2m3(const __amd_bf16x32_storage_t in, const __amd_scale_t scale)
|
||||
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e2m3(const __amd_bf16x32_storage_t in,
|
||||
const __amd_scale_t scale)
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
: __x(__builtin_amdgcn_cvt_scalef32_pk32_fp6_bf16(in, __amd_scale_to_float(scale))){}
|
||||
#else
|
||||
@@ -832,8 +832,8 @@ struct __hipext_ocp_fp6x32_e3m2 {
|
||||
}
|
||||
#endif
|
||||
|
||||
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e3m2(const __amd_fp16x32_storage_t in,
|
||||
const __amd_scale_t scale)
|
||||
__OCP_FP_HOST_DEVICE__
|
||||
__hipext_ocp_fp6x32_e3m2(const __amd_fp16x32_storage_t in, const __amd_scale_t scale)
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
: __x(__builtin_amdgcn_cvt_scalef32_pk32_bf6_f16(in, __amd_scale_to_float(scale))){}
|
||||
#else
|
||||
@@ -855,8 +855,8 @@ struct __hipext_ocp_fp6x32_e3m2 {
|
||||
}
|
||||
#endif
|
||||
|
||||
__OCP_FP_HOST_DEVICE__ __hipext_ocp_fp6x32_e3m2(const __amd_bf16x32_storage_t in,
|
||||
const __amd_scale_t scale)
|
||||
__OCP_FP_HOST_DEVICE__
|
||||
__hipext_ocp_fp6x32_e3m2(const __amd_bf16x32_storage_t in, const __amd_scale_t scale)
|
||||
#if HIP_ENABLE_GFX950_OCP_BUILTINS
|
||||
: __x(__builtin_amdgcn_cvt_scalef32_pk32_bf6_bf16(in, __amd_scale_to_float(scale))){}
|
||||
#else
|
||||
|
||||
@@ -793,11 +793,11 @@ __OCP_FP_HOST_DEVICE_STATIC__ OutType fp6_cvt_packedx32(InType in, int8_t scale
|
||||
uint32_t seed = 0) {
|
||||
// This is tightly coupled with the definitions of the amd_ocp_types
|
||||
constexpr bool in_float = std::is_same<InType, __amd_floatx32_storage_t>::value ||
|
||||
std::is_same<InType, __amd_fp16x32_storage_t>::value ||
|
||||
std::is_same<InType, __amd_bf16x32_storage_t>::value;
|
||||
std::is_same<InType, __amd_fp16x32_storage_t>::value ||
|
||||
std::is_same<InType, __amd_bf16x32_storage_t>::value;
|
||||
constexpr bool out_float = std::is_same<OutType, __amd_floatx32_storage_t>::value ||
|
||||
std::is_same<OutType, __amd_fp16x32_storage_t>::value ||
|
||||
std::is_same<OutType, __amd_bf16x32_storage_t>::value;
|
||||
std::is_same<OutType, __amd_fp16x32_storage_t>::value ||
|
||||
std::is_same<OutType, __amd_bf16x32_storage_t>::value;
|
||||
using other_type = std::conditional<in_float, OutType, InType>::type;
|
||||
|
||||
struct fp6x32_packed {
|
||||
|
||||
@@ -314,9 +314,8 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) {
|
||||
* @return Original value contained in \p addr.
|
||||
*/
|
||||
__device__ inline float safeAtomicAdd(float* addr, float value) {
|
||||
#if defined(__gfx908__) || \
|
||||
((defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && \
|
||||
!__has_builtin(__hip_atomic_fetch_add))
|
||||
#if defined(__gfx908__) || ((defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) && \
|
||||
!__has_builtin(__hip_atomic_fetch_add))
|
||||
// On gfx908, we can generate unsafe FP32 atomic add that does not follow all
|
||||
// IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
|
||||
// On gfx90a, gfx942 and gfx950 if we do not have the __hip_atomic_fetch_add builtin, we
|
||||
|
||||
@@ -59,9 +59,9 @@ template <typename T, unsigned int n> struct HIP_vector_base;
|
||||
template <typename T, unsigned int rank> struct HIP_vector_type;
|
||||
|
||||
namespace hip_impl {
|
||||
template <typename T, unsigned int n>
|
||||
__attribute__((always_inline)) __HOST_DEVICE__ typename HIP_vector_base<T, n>::Native_vec_*
|
||||
get_native_pointer(HIP_vector_base<T, n>& base_vec) {
|
||||
template <typename T, unsigned int n> __attribute__((always_inline)) __HOST_DEVICE__
|
||||
typename HIP_vector_base<T, n>::Native_vec_*
|
||||
get_native_pointer(HIP_vector_base<T, n>& base_vec) {
|
||||
static_assert(sizeof(base_vec) == sizeof(typename HIP_vector_base<T, n>::Native_vec_));
|
||||
static_assert(__hip_internal::alignment_of<HIP_vector_base<T, n>>::value ==
|
||||
__hip_internal::alignment_of<typename HIP_vector_base<T, n>::Native_vec_>::value);
|
||||
@@ -78,9 +78,9 @@ get_native_pointer(const HIP_vector_base<T, n>& base_vec) {
|
||||
};
|
||||
} // Namespace hip_impl.
|
||||
|
||||
template <typename T, unsigned int n>
|
||||
__attribute__((always_inline)) __HOST_DEVICE__ typename HIP_vector_base<T, n>::Native_vec_&
|
||||
get_native_vector(HIP_vector_base<T, n>& base_vec) {
|
||||
template <typename T, unsigned int n> __attribute__((always_inline)) __HOST_DEVICE__
|
||||
typename HIP_vector_base<T, n>::Native_vec_&
|
||||
get_native_vector(HIP_vector_base<T, n>& base_vec) {
|
||||
return *hip_impl::get_native_pointer(base_vec);
|
||||
};
|
||||
|
||||
@@ -308,9 +308,8 @@ template <typename T, unsigned int rank> struct HIP_vector_type : public HIP_vec
|
||||
|
||||
__HOST_DEVICE__
|
||||
HIP_vector_type() = default;
|
||||
template <typename U,
|
||||
typename __hip_internal::enable_if<__hip_internal::is_convertible<U, T>::value>::type* =
|
||||
nullptr>
|
||||
template <typename U, typename __hip_internal::enable_if<
|
||||
__hip_internal::is_convertible<U, T>::value>::type* = nullptr>
|
||||
__HOST_DEVICE__ explicit constexpr HIP_vector_type(U x_) noexcept
|
||||
: HIP_vector_base<T, rank>{static_cast<T>(x_)} {}
|
||||
template < // TODO: constrain based on type as well.
|
||||
@@ -368,9 +367,8 @@ template <typename T, unsigned int rank> struct HIP_vector_type : public HIP_vec
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
template <
|
||||
typename U,
|
||||
typename __hip_internal::enable_if<__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
|
||||
template <typename U, typename __hip_internal::enable_if<
|
||||
__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
|
||||
__HOST_DEVICE__ HIP_vector_type& operator+=(U x) noexcept {
|
||||
return *this += make_vector_type<T, rank>(x);
|
||||
}
|
||||
@@ -383,9 +381,8 @@ template <typename T, unsigned int rank> struct HIP_vector_type : public HIP_vec
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
template <
|
||||
typename U,
|
||||
typename __hip_internal::enable_if<__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
|
||||
template <typename U, typename __hip_internal::enable_if<
|
||||
__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
|
||||
__HOST_DEVICE__ HIP_vector_type& operator-=(U x) noexcept {
|
||||
return *this -= make_vector_type<T, rank>(x);
|
||||
}
|
||||
@@ -404,9 +401,8 @@ template <typename T, unsigned int rank> struct HIP_vector_type : public HIP_vec
|
||||
return HIP_vector_type{x} *= y;
|
||||
}
|
||||
|
||||
template <
|
||||
typename U,
|
||||
typename __hip_internal::enable_if<__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
|
||||
template <typename U, typename __hip_internal::enable_if<
|
||||
__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
|
||||
__HOST_DEVICE__ HIP_vector_type& operator*=(U x) noexcept {
|
||||
return *this *= make_vector_type<T, rank>(x);
|
||||
}
|
||||
@@ -424,9 +420,8 @@ template <typename T, unsigned int rank> struct HIP_vector_type : public HIP_vec
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
template <
|
||||
typename U,
|
||||
typename __hip_internal::enable_if<__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
|
||||
template <typename U, typename __hip_internal::enable_if<
|
||||
__hip_internal::is_convertible<U, T>{}>::type* = nullptr>
|
||||
__HOST_DEVICE__ HIP_vector_type& operator/=(U x) noexcept {
|
||||
return *this /= make_vector_type<T, rank>(x);
|
||||
}
|
||||
@@ -576,8 +571,7 @@ __HOST_DEVICE__ inline constexpr HIP_vector_type<T, n> operator/(
|
||||
return make_vector_type<T, n>(x) /= y;
|
||||
}
|
||||
|
||||
template <typename T, unsigned int n>
|
||||
__HOST_DEVICE__ inline
|
||||
template <typename T, unsigned int n> __HOST_DEVICE__ inline
|
||||
#if __cplusplus >= 201402L && !defined(__HIPCC_RTC__)
|
||||
constexpr
|
||||
#endif
|
||||
|
||||
@@ -109,9 +109,8 @@ static __HOST_DEVICE__ __forceinline__ int __hipGetPixelAddr(int x, int format,
|
||||
* \param x [in] The coordinate where the value will be read out.
|
||||
* \param boundaryMode [in] The boundary mode is currently ignored.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t surfObj, int x,
|
||||
int boundaryMode = hipBoundaryModeZero) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT;
|
||||
@@ -128,9 +127,8 @@ static __device__ __hip_img_chk__ void surf1Dread(T* data, hipSurfaceObject_t su
|
||||
* \param surfObj [in] The surface descriptor.
|
||||
* \param x [in] The coordinate where the data will be written.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t surfObj, int x) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
x = __hipGetPixelAddr(x, __ockl_image_channel_data_type_1D(i), __ockl_image_channel_order_1D(i));
|
||||
@@ -147,9 +145,8 @@ static __device__ __hip_img_chk__ void surf1Dwrite(T data, hipSurfaceObject_t su
|
||||
* \param x [in] The x coordinate where the value will be read out.
|
||||
* \param y [in] The y coordinate where the value will be read out.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t surfObj, int x,
|
||||
int y) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
@@ -168,9 +165,8 @@ static __device__ __hip_img_chk__ void surf2Dread(T* data, hipSurfaceObject_t su
|
||||
* \param x [in] The x coordinate where the data will be written.
|
||||
* \param y [in] The y coordinate where the data will be written.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t surfObj, int x,
|
||||
int y) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
@@ -190,9 +186,8 @@ static __device__ __hip_img_chk__ void surf2Dwrite(T data, hipSurfaceObject_t su
|
||||
* \param y [in] The y coordinate where the value will be read out.
|
||||
* \param z [in] The z coordinate where the value will be read out.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t surfObj, int x, int y,
|
||||
int z) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
@@ -212,9 +207,8 @@ static __device__ __hip_img_chk__ void surf3Dread(T* data, hipSurfaceObject_t su
|
||||
* \param y [in] The y coordinate where the data will be written.
|
||||
* \param z [in] The z coordinate where the data will be written.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t surfObj, int x, int y,
|
||||
int z) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
@@ -233,9 +227,8 @@ static __device__ __hip_img_chk__ void surf3Dwrite(T data, hipSurfaceObject_t su
|
||||
* \param x [in] The coordinate where the value will be read out.
|
||||
* \param layer [in] The layer index where the value will be read out.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObject_t surfObj, int x,
|
||||
int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
@@ -253,9 +246,8 @@ static __device__ __hip_img_chk__ void surf1DLayeredread(T* data, hipSurfaceObje
|
||||
* \param x [in] The x coordinate where the data will be written.
|
||||
* \param layer [in] The layer index where the data will be written.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x,
|
||||
int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
@@ -274,9 +266,8 @@ static __device__ __hip_img_chk__ void surf1DLayeredwrite(T data, hipSurfaceObje
|
||||
* \param y [in] The y coordinate where the value will be read out.
|
||||
* \param layer [in] The layer index where the value will be read out.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObject_t surfObj, int x,
|
||||
int y, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
@@ -296,9 +287,8 @@ static __device__ __hip_img_chk__ void surf2DLayeredread(T* data, hipSurfaceObje
|
||||
* \param y [in] The y coordinate where the data will be written.
|
||||
* \param layer [in] The layer index where the data will be written.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObject_t surfObj, int x,
|
||||
int y, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
@@ -318,9 +308,8 @@ static __device__ __hip_img_chk__ void surf2DLayeredwrite(T data, hipSurfaceObje
|
||||
* \param y [in] The y coordinate where the value will be read out.
|
||||
* \param face [in] The face index where the value will be read out.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject_t surfObj, int x,
|
||||
int y, int face) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
@@ -340,9 +329,8 @@ static __device__ __hip_img_chk__ void surfCubemapread(T* data, hipSurfaceObject
|
||||
* \param y [in] The y coordinate where the data will be written.
|
||||
* \param face [in] The face index where the data will be written.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject_t surfObj, int x,
|
||||
int y, int face) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
@@ -363,9 +351,8 @@ static __device__ __hip_img_chk__ void surfCubemapwrite(T data, hipSurfaceObject
|
||||
* \param face [in] The face index where the value will be read out.
|
||||
* \param layer [in] The layer index where the data will be written.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfaceObject_t surfObj,
|
||||
int x, int y, int face, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
@@ -386,9 +373,8 @@ static __device__ __hip_img_chk__ void surfCubemapLayeredread(T* data, hipSurfac
|
||||
* \param face [in] The face index where the data will be written.
|
||||
* \param layer [in] The layer index where the data will be written.
|
||||
*/
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void surfCubemapLayeredwrite(T* data, hipSurfaceObject_t surfObj,
|
||||
int x, int y, int face, int layer) {
|
||||
__HIP_SURFACE_OBJECT_PARAMETERS_INIT
|
||||
|
||||
@@ -443,7 +443,7 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
|
||||
return backwardPermute(firstLane << 2, result);
|
||||
else {
|
||||
auto tmp = (static_cast<unsigned long long>(backwardPermute(firstLane << 2, result[1])) << 32) |
|
||||
static_cast<unsigned int>(backwardPermute(firstLane << 2, result[0]));
|
||||
static_cast<unsigned int>(backwardPermute(firstLane << 2, result[0]));
|
||||
return *reinterpret_cast<T*>(&tmp);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -130,12 +130,9 @@ inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSiz
|
||||
blockSizeLimit);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
|
||||
T kernel,
|
||||
size_t dynSharedMemPerBlk = 0,
|
||||
int blockSizeLimit = 0,
|
||||
unsigned int flags = 0) {
|
||||
template <class T> inline hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(
|
||||
int* gridSize, int* blockSize, T kernel, size_t dynSharedMemPerBlk = 0, int blockSizeLimit = 0,
|
||||
unsigned int flags = 0) {
|
||||
using namespace hip_impl;
|
||||
|
||||
hip_impl::hip_init();
|
||||
|
||||
@@ -51,11 +51,11 @@ namespace std { // TODO: these should be removed as soon as possible.
|
||||
#if (__cplusplus < 201406L)
|
||||
#if (__cplusplus < 201402L)
|
||||
template <bool cond, typename T = void> using enable_if_t = typename enable_if<cond, T>::type;
|
||||
template <bool cond, typename T, typename U>
|
||||
using conditional_t = typename conditional<cond, T, U>::type;
|
||||
template <bool cond, typename T, typename U> using conditional_t =
|
||||
typename conditional<cond, T, U>::type;
|
||||
template <typename T> using decay_t = typename decay<T>::type;
|
||||
template <FunctionalProcedure F, typename... Ts>
|
||||
using result_of_t = typename result_of<F(Ts...)>::type;
|
||||
template <FunctionalProcedure F, typename... Ts> using result_of_t =
|
||||
typename result_of<F(Ts...)>::type;
|
||||
template <typename T> using remove_reference_t = typename remove_reference<T>::type;
|
||||
#endif
|
||||
#endif
|
||||
@@ -67,8 +67,8 @@ template <typename...> using void_t_ = void;
|
||||
#if HIP_HAS_INVOCABLE
|
||||
template <typename, typename = void> struct is_callable_impl;
|
||||
|
||||
template <FunctionalProcedure F, typename... Ts>
|
||||
struct is_callable_impl<F(Ts...)> : std::is_invocable<F, Ts...> {};
|
||||
template <FunctionalProcedure F, typename... Ts> struct is_callable_impl<F(Ts...)>
|
||||
: std::is_invocable<F, Ts...> {};
|
||||
#elif HIP_HAS_RESULT_OF_SFINAE
|
||||
template <typename, typename = void> struct is_callable_impl : std::false_type {};
|
||||
|
||||
@@ -76,11 +76,10 @@ template <FunctionalProcedure F, typename... Ts>
|
||||
struct is_callable_impl<F(Ts...), void_t_<typename std::result_of<F(Ts...)>::type> >
|
||||
: std::true_type {};
|
||||
#else
|
||||
template <class Base, class T, class Derived>
|
||||
auto simple_invoke(T Base::* pmd, Derived&& ref) -> decltype(static_cast<Derived&&>(ref).*pmd);
|
||||
template <class Base, class T, class Derived> auto simple_invoke(T Base::* pmd, Derived&& ref)
|
||||
-> decltype(static_cast<Derived&&>(ref).*pmd);
|
||||
|
||||
template <class PMD, class Pointer>
|
||||
auto simple_invoke(PMD&& pmd, Pointer&& ptr)
|
||||
template <class PMD, class Pointer> auto simple_invoke(PMD&& pmd, Pointer&& ptr)
|
||||
-> decltype((*static_cast<Pointer&&>(ptr)).*static_cast<PMD&&>(pmd));
|
||||
|
||||
template <class Base, class T, class Derived>
|
||||
@@ -100,8 +99,8 @@ template <class Base, class T, class Derived, class... Args>
|
||||
auto simple_invoke(T Base::* pmf, const std::reference_wrapper<Derived>& ref, Args&&... args)
|
||||
-> decltype((ref.get().*pmf)(static_cast<Args&&>(args)...));
|
||||
|
||||
template <class F, class... Ts>
|
||||
auto simple_invoke(F&& f, Ts&&... xs) -> decltype(f(static_cast<Ts&&>(xs)...));
|
||||
template <class F, class... Ts> auto simple_invoke(F&& f, Ts&&... xs)
|
||||
-> decltype(f(static_cast<Ts&&>(xs)...));
|
||||
|
||||
template <typename, typename = void> struct is_callable_impl : std::false_type {};
|
||||
|
||||
|
||||
@@ -56,19 +56,19 @@ using lane_mask = unsigned long long int;
|
||||
namespace cooperative_groups {
|
||||
|
||||
/* Global scope */
|
||||
template <unsigned int size>
|
||||
using is_power_of_2 = __hip_internal::integral_constant<bool, (size & (size - 1)) == 0>;
|
||||
template <unsigned int size> using is_power_of_2 =
|
||||
__hip_internal::integral_constant<bool, (size & (size - 1)) == 0>;
|
||||
|
||||
template <unsigned int size>
|
||||
using is_valid_wavefront = __hip_internal::integral_constant<bool, size <= 64>;
|
||||
template <unsigned int size> using is_valid_wavefront =
|
||||
__hip_internal::integral_constant<bool, size <= 64>;
|
||||
|
||||
template <unsigned int size>
|
||||
using is_valid_tile_size = __hip_internal::integral_constant<
|
||||
bool, is_power_of_2<size>::value && is_valid_wavefront<size>::value>;
|
||||
template <unsigned int size> using is_valid_tile_size =
|
||||
__hip_internal::integral_constant<bool, is_power_of_2<size>::value &&
|
||||
is_valid_wavefront<size>::value>;
|
||||
|
||||
template <typename T>
|
||||
using is_valid_type = __hip_internal::integral_constant<
|
||||
bool, __hip_internal::is_integral<T>::value || __hip_internal::is_floating_point<T>::value>;
|
||||
template <typename T> using is_valid_type =
|
||||
__hip_internal::integral_constant<bool, __hip_internal::is_integral<T>::value ||
|
||||
__hip_internal::is_floating_point<T>::value>;
|
||||
|
||||
namespace internal {
|
||||
|
||||
|
||||
@@ -8101,9 +8101,10 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
|
||||
break;
|
||||
// hipDeviceGetPCIBusId[('char*', 'pciBusId'), ('int', 'len'), ('int', 'device')]
|
||||
case HIP_API_ID_hipDeviceGetPCIBusId:
|
||||
data->args.hipDeviceGetPCIBusId.pciBusId = (data->args.hipDeviceGetPCIBusId.pciBusId)
|
||||
? strdup(data->args.hipDeviceGetPCIBusId.pciBusId)
|
||||
: NULL;
|
||||
data->args.hipDeviceGetPCIBusId.pciBusId =
|
||||
(data->args.hipDeviceGetPCIBusId.pciBusId)
|
||||
? strdup(data->args.hipDeviceGetPCIBusId.pciBusId)
|
||||
: NULL;
|
||||
break;
|
||||
// hipDeviceGetSharedMemConfig[('hipSharedMemConfig*', 'pConfig')]
|
||||
case HIP_API_ID_hipDeviceGetSharedMemConfig:
|
||||
@@ -8991,9 +8992,10 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
|
||||
if (data->args.hipGraphInstantiate.pErrorNode)
|
||||
data->args.hipGraphInstantiate.pErrorNode__val =
|
||||
*(data->args.hipGraphInstantiate.pErrorNode);
|
||||
data->args.hipGraphInstantiate.pLogBuffer = (data->args.hipGraphInstantiate.pLogBuffer)
|
||||
? strdup(data->args.hipGraphInstantiate.pLogBuffer)
|
||||
: NULL;
|
||||
data->args.hipGraphInstantiate.pLogBuffer =
|
||||
(data->args.hipGraphInstantiate.pLogBuffer)
|
||||
? strdup(data->args.hipGraphInstantiate.pLogBuffer)
|
||||
: NULL;
|
||||
break;
|
||||
// hipGraphInstantiateWithFlags[('hipGraphExec_t*', 'pGraphExec'), ('hipGraph_t', 'graph'),
|
||||
// ('unsigned long long', 'flags')]
|
||||
@@ -15959,9 +15961,8 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da
|
||||
oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.blockSize);
|
||||
oss << ", dynSharedMemPerBlk=";
|
||||
roctracer::hip_support::detail::operator<<(
|
||||
oss,
|
||||
data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
|
||||
.dynSharedMemPerBlk);
|
||||
oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
|
||||
.dynSharedMemPerBlk);
|
||||
oss << ", flags=";
|
||||
roctracer::hip_support::detail::operator<<(
|
||||
oss, data->args.hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.flags);
|
||||
|
||||
@@ -114,11 +114,11 @@ template <typename __T, typename __U> struct is_same : public false_type {};
|
||||
template <typename __T> struct is_same<__T, __T> : public true_type {};
|
||||
|
||||
template <typename _Tp, bool = is_arithmetic<_Tp>::value> struct is_signed : public false_type {};
|
||||
template <typename _Tp>
|
||||
struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {};
|
||||
template <typename _Tp> struct is_signed<_Tp, true> : public true_or_false_type<_Tp(-1) < _Tp(0)> {
|
||||
};
|
||||
|
||||
template <class T>
|
||||
auto test_returnable(int) -> decltype(void(static_cast<T (*)()>(nullptr)), true_type{});
|
||||
template <class T> auto test_returnable(int)
|
||||
-> decltype(void(static_cast<T (*)()>(nullptr)), true_type{});
|
||||
template <class> auto test_returnable(...) -> false_type;
|
||||
|
||||
template <class T> struct type_identity {
|
||||
@@ -139,8 +139,7 @@ template <class T> struct add_rvalue_reference : decltype(try_add_rvalue_referen
|
||||
|
||||
template <typename T> typename add_rvalue_reference<T>::type declval() noexcept;
|
||||
|
||||
template <class From, class To>
|
||||
auto test_implicitly_convertible(int)
|
||||
template <class From, class To> auto test_implicitly_convertible(int)
|
||||
-> decltype(void(declval<void (&)(To)>()(declval<From>())), true_type{});
|
||||
|
||||
template <class, class> auto test_implicitly_convertible(...) -> false_type;
|
||||
@@ -160,12 +159,10 @@ template <class T> struct remove_cv<const volatile T> {
|
||||
|
||||
template <class T> struct is_void : public is_same<void, typename remove_cv<T>::type> {};
|
||||
|
||||
template <class From, class To>
|
||||
struct is_convertible
|
||||
: public integral_constant<bool,
|
||||
(decltype(test_returnable<To>(0))::value &&
|
||||
decltype(test_implicitly_convertible<From, To>(0))::value) ||
|
||||
(is_void<From>::value && is_void<To>::value)> {};
|
||||
template <class From, class To> struct is_convertible
|
||||
: public integral_constant<bool, (decltype(test_returnable<To>(0))::value &&
|
||||
decltype(test_implicitly_convertible<From, To>(0))::value) ||
|
||||
(is_void<From>::value && is_void<To>::value)> {};
|
||||
|
||||
template <typename _CharT> struct char_traits;
|
||||
template <typename _CharT, typename _Traits = char_traits<_CharT>> class basic_istream;
|
||||
@@ -173,8 +170,8 @@ template <typename _CharT, typename _Traits = char_traits<_CharT>> class basic_o
|
||||
typedef basic_istream<char> istream;
|
||||
typedef basic_ostream<char> ostream;
|
||||
|
||||
template <typename _Tp>
|
||||
struct is_standard_layout : public integral_constant<bool, __is_standard_layout(_Tp)> {};
|
||||
template <typename _Tp> struct is_standard_layout
|
||||
: public integral_constant<bool, __is_standard_layout(_Tp)> {};
|
||||
|
||||
template <typename _Tp> struct is_trivial : public integral_constant<bool, __is_trivial(_Tp)> {};
|
||||
|
||||
@@ -195,15 +192,15 @@ template <typename T, T... Ints> struct integer_sequence {
|
||||
|
||||
template <size_t... Ints> using index_sequence = integer_sequence<size_t, Ints...>;
|
||||
|
||||
template <size_t _hip_N, size_t... Ints>
|
||||
struct make_index_sequence_impl : make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {};
|
||||
template <size_t _hip_N, size_t... Ints> struct make_index_sequence_impl
|
||||
: make_index_sequence_impl<_hip_N - 1, _hip_N - 1, Ints...> {};
|
||||
|
||||
template <size_t... Ints> struct make_index_sequence_impl<0, Ints...> {
|
||||
using type = index_sequence<Ints...>;
|
||||
};
|
||||
|
||||
template <size_t _hip_N>
|
||||
using make_index_sequence = typename make_index_sequence_impl<_hip_N>::type;
|
||||
template <size_t _hip_N> using make_index_sequence =
|
||||
typename make_index_sequence_impl<_hip_N>::type;
|
||||
|
||||
template <size_t... Ints>
|
||||
constexpr index_sequence<Ints...> make_index_sequence_value(index_sequence<Ints...>) {
|
||||
|
||||
@@ -61,9 +61,9 @@ template <typename C, typename D> RAII_guard<C, D> make_RAII_guard(const C& ctor
|
||||
return RAII_guard<C, D>{ctor, std::move(dtor)};
|
||||
}
|
||||
|
||||
template <FunctionalProcedure F, typename... Ts>
|
||||
using is_new_grid_launch_t = typename std::conditional<is_callable<F(Ts...)>{}, New_grid_launch_tag,
|
||||
Old_grid_launch_tag>::type;
|
||||
template <FunctionalProcedure F, typename... Ts> using is_new_grid_launch_t =
|
||||
typename std::conditional<is_callable<F(Ts...)>{}, New_grid_launch_tag,
|
||||
Old_grid_launch_tag>::type;
|
||||
} // namespace
|
||||
|
||||
// TODO: - dispatch rank should be derived from the domain dimensions passed
|
||||
|
||||
@@ -37,8 +37,8 @@ THE SOFTWARE.
|
||||
(void)s;
|
||||
|
||||
template <typename T> struct __hip_is_tex_surf_scalar_channel_type {
|
||||
static constexpr bool value = __hip_internal::is_same<T, char>::value ||
|
||||
__hip_internal::is_same<T, unsigned char>::value ||
|
||||
static constexpr bool value =
|
||||
__hip_internal::is_same<T, char>::value || __hip_internal::is_same<T, unsigned char>::value ||
|
||||
__hip_internal::is_same<T, short>::value ||
|
||||
__hip_internal::is_same<T, unsigned short>::value || __hip_internal::is_same<T, int>::value ||
|
||||
__hip_internal::is_same<T, unsigned int>::value || __hip_internal::is_same<T, float>::value;
|
||||
@@ -51,12 +51,12 @@ template <typename T> struct __hip_is_tex_surf_channel_type {
|
||||
template <typename T, unsigned int rank>
|
||||
struct __hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>> {
|
||||
static constexpr bool value = __hip_is_tex_surf_scalar_channel_type<T>::value &&
|
||||
((rank == 1) || (rank == 2) || (rank == 4));
|
||||
((rank == 1) || (rank == 2) || (rank == 4));
|
||||
};
|
||||
|
||||
template <typename T> struct __hip_is_tex_normalized_channel_type {
|
||||
static constexpr bool value = __hip_internal::is_same<T, char>::value ||
|
||||
__hip_internal::is_same<T, unsigned char>::value ||
|
||||
static constexpr bool value =
|
||||
__hip_internal::is_same<T, char>::value || __hip_internal::is_same<T, unsigned char>::value ||
|
||||
__hip_internal::is_same<T, short>::value || __hip_internal::is_same<T, unsigned short>::value;
|
||||
};
|
||||
|
||||
@@ -73,8 +73,7 @@ template <typename T, hipTextureReadMode readMode, typename Enable = void> struc
|
||||
/*
|
||||
* Map from device function return U to scalar texture type T
|
||||
*/
|
||||
template <typename T, typename U>
|
||||
__forceinline__ __device__
|
||||
template <typename T, typename U> __forceinline__ __device__
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type<T>::value,
|
||||
const T>::type
|
||||
__hipMapFrom(const U& u) {
|
||||
@@ -96,8 +95,7 @@ __forceinline__ __device__
|
||||
/*
|
||||
* Map from device function return U to vector texture type T
|
||||
*/
|
||||
template <typename T, typename U>
|
||||
__forceinline__ __device__ typename __hip_internal::enable_if<
|
||||
template <typename T, typename U> __forceinline__ __device__ typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const T>::type
|
||||
__hipMapFrom(const U& u) {
|
||||
if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
|
||||
@@ -118,8 +116,7 @@ __hipMapFrom(const U& u) {
|
||||
/*
|
||||
* Map from scalar texture type T to device function input U
|
||||
*/
|
||||
template <typename U, typename T>
|
||||
__forceinline__ __device__
|
||||
template <typename U, typename T> __forceinline__ __device__
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_scalar_channel_type<T>::value,
|
||||
const U>::type
|
||||
__hipMapTo(const T& t) {
|
||||
@@ -143,8 +140,7 @@ __forceinline__ __device__
|
||||
/*
|
||||
* Map from vector texture type T to device function input U
|
||||
*/
|
||||
template <typename U, typename T>
|
||||
__forceinline__ __device__ typename __hip_internal::enable_if<
|
||||
template <typename U, typename T> __forceinline__ __device__ typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_scalar_channel_type<typename T::value_type>::value, const U>::type
|
||||
__hipMapTo(const T& t) {
|
||||
if constexpr (sizeof(typename T::value_type) < sizeof(float)) {
|
||||
@@ -164,18 +160,16 @@ __hipMapTo(const T& t) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
using __hip_tex_ret_t = typename __hip_tex_ret<T, readMode, bool>::type;
|
||||
template <typename T, hipTextureReadMode readMode> using __hip_tex_ret_t =
|
||||
typename __hip_tex_ret<T, readMode, bool>::type;
|
||||
|
||||
template <typename T>
|
||||
struct __hip_tex_ret<
|
||||
template <typename T> struct __hip_tex_ret<
|
||||
T, hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type> {
|
||||
using type = T;
|
||||
};
|
||||
|
||||
template <typename T, unsigned int rank>
|
||||
struct __hip_tex_ret<
|
||||
template <typename T, unsigned int rank> struct __hip_tex_ret<
|
||||
HIP_vector_type<T, rank>, hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
|
||||
@@ -189,8 +183,7 @@ struct __hip_tex_ret<T, hipReadModeNormalizedFloat,
|
||||
using type = float;
|
||||
};
|
||||
|
||||
template <typename T, unsigned int rank>
|
||||
struct __hip_tex_ret<
|
||||
template <typename T, unsigned int rank> struct __hip_tex_ret<
|
||||
HIP_vector_type<T, rank>, hipReadModeNormalizedFloat,
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_normalized_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
|
||||
@@ -421,18 +414,16 @@ struct __hip_tex2dgather_ret {
|
||||
static_assert(__hip_internal::is_same<Enable, void>::value, "Invalid channel type!");
|
||||
};
|
||||
|
||||
template <typename T, hipTextureReadMode readMode>
|
||||
using __hip_tex2dgather_ret_t = typename __hip_tex2dgather_ret<T, readMode, bool>::type;
|
||||
template <typename T, hipTextureReadMode readMode> using __hip_tex2dgather_ret_t =
|
||||
typename __hip_tex2dgather_ret<T, readMode, bool>::type;
|
||||
|
||||
template <typename T>
|
||||
struct __hip_tex2dgather_ret<
|
||||
template <typename T> struct __hip_tex2dgather_ret<
|
||||
T, hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value, bool>::type> {
|
||||
using type = HIP_vector_type<T, 4>;
|
||||
};
|
||||
|
||||
template <typename T, unsigned int rank>
|
||||
struct __hip_tex2dgather_ret<
|
||||
template <typename T, unsigned int rank> struct __hip_tex2dgather_ret<
|
||||
HIP_vector_type<T, rank>, hipReadModeElementType,
|
||||
typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<HIP_vector_type<T, rank>>::value, bool>::type> {
|
||||
|
||||
@@ -37,41 +37,36 @@ THE SOFTWARE.
|
||||
unsigned int ADDRESS_SPACE_CONSTANT* s = i + HIP_SAMPLER_OBJECT_OFFSET_DWORD; \
|
||||
(void)s;
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1Dfetch(hipTextureObject_t textureObject, int x) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
auto tmp = __ockl_image_load_1Db(i, x);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1Dfetch(T* ptr, hipTextureObject_t textureObject, int x) {
|
||||
*ptr = tex1Dfetch<T>(textureObject, x);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1D(hipTextureObject_t textureObject, float x) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
auto tmp = __ockl_image_sample_1D(i, s, x);
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1D(T* ptr, hipTextureObject_t textureObject, float x) {
|
||||
*ptr = tex1D<T>(textureObject, x);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, float x, float y) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
float2 coords{x, y};
|
||||
@@ -79,17 +74,15 @@ static __device__ __hip_img_chk__ T tex2D(hipTextureObject_t textureObject, floa
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2D(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y) {
|
||||
*ptr = tex2D<T>(textureObject, x, y);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, float x, float y,
|
||||
float z) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -98,17 +91,15 @@ static __device__ __hip_img_chk__ T tex3D(hipTextureObject_t textureObject, floa
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex3D(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float z) {
|
||||
*ptr = tex3D<T>(textureObject, x, y, z);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObject, float x,
|
||||
int layer) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -117,17 +108,15 @@ static __device__ __hip_img_chk__ T tex1DLayered(hipTextureObject_t textureObjec
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1DLayered(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, int layer) {
|
||||
*ptr = tex1DLayered<T>(textureObject, x, layer);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObject, float x, float y,
|
||||
int layer) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -136,17 +125,15 @@ static __device__ __hip_img_chk__ T tex2DLayered(hipTextureObject_t textureObjec
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2DLayered(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, int layer) {
|
||||
*ptr = tex1DLayered<T>(textureObject, x, y, layer);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject, float x, float y,
|
||||
float z) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -155,17 +142,15 @@ static __device__ __hip_img_chk__ T texCubemap(hipTextureObject_t textureObject,
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemap(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float z) {
|
||||
*ptr = texCubemap<T>(textureObject, x, y, z);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t textureObject, float x,
|
||||
float y, float z, int layer) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -174,17 +159,15 @@ static __device__ __hip_img_chk__ T texCubemapLayered(hipTextureObject_t texture
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemapLayered(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, float z, int layer) {
|
||||
*ptr = texCubemapLayered<T>(textureObject, x, y, z, layer);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject, float x, float y,
|
||||
int comp = 0) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -214,17 +197,15 @@ static __device__ __hip_img_chk__ T tex2Dgather(hipTextureObject_t textureObject
|
||||
return {};
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2Dgather(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, int comp = 0) {
|
||||
*ptr = texCubemapLayered<T>(textureObject, x, y, comp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, float x,
|
||||
float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -232,17 +213,15 @@ static __device__ __hip_img_chk__ T tex1DLod(hipTextureObject_t textureObject, f
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1DLod(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float level) {
|
||||
*ptr = tex1DLod<T>(textureObject, x, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, float x, float y,
|
||||
float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -251,17 +230,15 @@ static __device__ __hip_img_chk__ T tex2DLod(hipTextureObject_t textureObject, f
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2DLod(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float level) {
|
||||
*ptr = tex2DLod<T>(textureObject, x, y, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, float x, float y,
|
||||
float z, float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -270,17 +247,15 @@ static __device__ __hip_img_chk__ T tex3DLod(hipTextureObject_t textureObject, f
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex3DLod(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float z, float level) {
|
||||
*ptr = tex3DLod<T>(textureObject, x, y, z, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureObject, float x,
|
||||
int layer, float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT;
|
||||
@@ -290,17 +265,15 @@ static __device__ __hip_img_chk__ T tex1DLayeredLod(hipTextureObject_t textureOb
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1DLayeredLod(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, int layer, float level) {
|
||||
*ptr = tex1DLayeredLod<T>(textureObject, x, layer, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureObject, float x,
|
||||
float y, int layer, float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT;
|
||||
@@ -310,17 +283,15 @@ static __device__ __hip_img_chk__ T tex2DLayeredLod(hipTextureObject_t textureOb
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2DLayeredLod(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, int layer, float level) {
|
||||
*ptr = tex2DLayeredLod<T>(textureObject, x, y, layer, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObject, float x,
|
||||
float y, float z, float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -329,17 +300,15 @@ static __device__ __hip_img_chk__ T texCubemapLod(hipTextureObject_t textureObje
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemapLod(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, float z, float level) {
|
||||
*ptr = texCubemapLod<T>(textureObject, x, y, z, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObject, float x,
|
||||
float y, float z, float4 dPdx, float4 dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT;
|
||||
@@ -355,18 +324,16 @@ static __device__ __hip_img_chk__ T texCubemapGrad(hipTextureObject_t textureObj
|
||||
return {};
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemapGrad(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, float z, float4 dPdx,
|
||||
float4 dPdy) {
|
||||
*ptr = texCubemapGrad<T>(textureObject, x, y, z, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t textureObject, float x,
|
||||
float y, float z, int layer, float level) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -375,9 +342,8 @@ static __device__ __hip_img_chk__ T texCubemapLayeredLod(hipTextureObject_t text
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemapLayeredLod(T* ptr,
|
||||
hipTextureObject_t textureObject,
|
||||
float x, float y, float z, int layer,
|
||||
@@ -385,9 +351,8 @@ static __device__ __hip_img_chk__ void texCubemapLayeredLod(T* ptr,
|
||||
*ptr = texCubemapLayeredLod<T>(textureObject, x, y, z, layer, level);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject, float x, float dPdx,
|
||||
float dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -395,17 +360,15 @@ static __device__ __hip_img_chk__ T tex1DGrad(hipTextureObject_t textureObject,
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1DGrad(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float dPdx, float dPdy) {
|
||||
*ptr = tex1DGrad<T>(textureObject, x, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject, float x, float y,
|
||||
float2 dPdx, float2 dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -415,17 +378,15 @@ static __device__ __hip_img_chk__ T tex2DGrad(hipTextureObject_t textureObject,
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2DGrad(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float2 dPdx, float2 dPdy) {
|
||||
*ptr = tex2DGrad<T>(textureObject, x, y, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject, float x, float y,
|
||||
float z, float4 dPdx, float4 dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT;
|
||||
@@ -438,17 +399,15 @@ static __device__ __hip_img_chk__ T tex3DGrad(hipTextureObject_t textureObject,
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex3DGrad(T* ptr, hipTextureObject_t textureObject, float x,
|
||||
float y, float z, float4 dPdx, float4 dPdy) {
|
||||
*ptr = tex3DGrad<T>(textureObject, x, y, z, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureObject, float x,
|
||||
int layer, float dPdx, float dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -457,18 +416,16 @@ static __device__ __hip_img_chk__ T tex1DLayeredGrad(hipTextureObject_t textureO
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex1DLayeredGrad(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, int layer, float dPdx,
|
||||
float dPdy) {
|
||||
*ptr = tex1DLayeredGrad<T>(textureObject, x, layer, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureObject, float x,
|
||||
float y, int layer, float2 dPdx, float2 dPdy) {
|
||||
TEXTURE_OBJECT_PARAMETERS_INIT
|
||||
@@ -478,18 +435,16 @@ static __device__ __hip_img_chk__ T tex2DLayeredGrad(hipTextureObject_t textureO
|
||||
return __hipMapFrom<T>(tmp);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void tex2DLayeredGrad(T* ptr, hipTextureObject_t textureObject,
|
||||
float x, float y, int layer, float2 dPdx,
|
||||
float2 dPdy) {
|
||||
*ptr = tex2DLayeredGrad<T>(textureObject, x, y, layer, dPdx, dPdy);
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t textureObject, float x,
|
||||
float y, float z, int layer, float4 dPdx,
|
||||
float4 dPdy) {
|
||||
@@ -507,9 +462,8 @@ static __device__ __hip_img_chk__ T texCubemapLayeredGrad(hipTextureObject_t tex
|
||||
return {};
|
||||
}
|
||||
|
||||
template <
|
||||
typename T,
|
||||
typename __hip_internal::enable_if<__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
template <typename T, typename __hip_internal::enable_if<
|
||||
__hip_is_tex_surf_channel_type<T>::value>::type* = nullptr>
|
||||
static __device__ __hip_img_chk__ void texCubemapLayeredGrad(T* ptr,
|
||||
hipTextureObject_t textureObject,
|
||||
float x, float y, float z, int layer,
|
||||
|
||||
@@ -156,8 +156,8 @@ bool isCodeObjectCompatibleWithDevice(std::string co_triple_target_id,
|
||||
static inline unsigned int getGenericVersion(const void* image) {
|
||||
const Elf64_Ehdr* ehdr = reinterpret_cast<const Elf64_Ehdr*>(image);
|
||||
return ehdr->e_ident[EI_ABIVERSION] == ELFABIVERSION_AMDGPU_HSA_V6
|
||||
? ((ehdr->e_flags & EF_AMDGPU_GENERIC_VERSION) >> EF_AMDGPU_GENERIC_VERSION_OFFSET)
|
||||
: 0;
|
||||
? ((ehdr->e_flags & EF_AMDGPU_GENERIC_VERSION) >> EF_AMDGPU_GENERIC_VERSION_OFFSET)
|
||||
: 0;
|
||||
}
|
||||
|
||||
static inline bool isGenericTarget(const void* image) {
|
||||
@@ -178,10 +178,9 @@ bool UnbundleBitCode(const std::vector<char>& bundled_llvm_bitcode, const std::s
|
||||
const void* data = reinterpret_cast<const void*>(bundled_llvm_bitcode_s.c_str());
|
||||
const auto obheader = reinterpret_cast<const __ClangOffloadBundleHeader*>(data);
|
||||
const auto* desc = &obheader->desc[0];
|
||||
for (uint64_t idx = 0; idx < obheader->numOfCodeObjects; ++idx,
|
||||
desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
|
||||
reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) +
|
||||
desc->bundleEntryIdSize)) {
|
||||
for (uint64_t idx = 0; idx < obheader->numOfCodeObjects;
|
||||
++idx, desc = reinterpret_cast<const __ClangOffloadBundleInfo*>(
|
||||
reinterpret_cast<uintptr_t>(&desc->bundleEntryId[0]) + desc->bundleEntryIdSize)) {
|
||||
const void* image =
|
||||
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(obheader) + desc->offset);
|
||||
const size_t image_size = desc->size;
|
||||
@@ -736,9 +735,8 @@ bool demangleName(const std::string& mangledName, std::string& demangledName) {
|
||||
|
||||
demangledName.resize(demangled_size);
|
||||
|
||||
if (AMD_COMGR_STATUS_SUCCESS !=
|
||||
amd::Comgr::get_data(demangled_data, &demangled_size,
|
||||
const_cast<char*>(demangledName.data()))) {
|
||||
if (AMD_COMGR_STATUS_SUCCESS != amd::Comgr::get_data(demangled_data, &demangled_size,
|
||||
const_cast<char*>(demangledName.data()))) {
|
||||
amd::Comgr::release_data(mangled_data);
|
||||
amd::Comgr::release_data(demangled_data);
|
||||
return false;
|
||||
|
||||
@@ -135,7 +135,7 @@ hipError_t Event::elapsedTime(Event& eStop, float& ms) {
|
||||
command->awaitCompletion();
|
||||
ms = static_cast<float>(static_cast<int64_t>(command->event().profilingInfo().end_) -
|
||||
time(false)) /
|
||||
1000000.f;
|
||||
1000000.f;
|
||||
command->release();
|
||||
} else {
|
||||
// Note: with direct dispatch eStop.ready() relies on HW event, but CPU status can be delayed.
|
||||
@@ -210,7 +210,8 @@ hipError_t Event::streamWait(hip::Stream* stream, uint flags) {
|
||||
hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t ext_flags,
|
||||
bool batch_flush) {
|
||||
if (command == nullptr) {
|
||||
int32_t releaseFlags = ((ext_flags == 0) ? flags_ : ext_flags) &
|
||||
int32_t releaseFlags =
|
||||
((ext_flags == 0) ? flags_ : ext_flags) &
|
||||
(hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence);
|
||||
if (releaseFlags & hipEventDisableSystemFence) {
|
||||
releaseFlags = amd::Device::kCacheStateIgnore;
|
||||
@@ -269,8 +270,8 @@ bool isValid(hipEvent_t event) {
|
||||
// ================================================================================================
|
||||
hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
|
||||
unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming |
|
||||
hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventInterprocess |
|
||||
hipEventDisableSystemFence;
|
||||
hipEventReleaseToDevice | hipEventReleaseToSystem |
|
||||
hipEventInterprocess | hipEventDisableSystemFence;
|
||||
|
||||
const unsigned releaseFlags =
|
||||
(hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventDisableSystemFence);
|
||||
@@ -284,7 +285,7 @@ hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
|
||||
}
|
||||
return bitcount;
|
||||
}(flags & releaseFlags) > 1) ||
|
||||
((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming));
|
||||
((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming));
|
||||
if (!illegalFlags) {
|
||||
hip::Event* e = nullptr;
|
||||
if (flags & hipEventInterprocess) {
|
||||
|
||||
@@ -37,10 +37,9 @@ template <typename comgr_T> class ComgrUniqueHandle {
|
||||
// constructor which takes ownership of a correctly initialzed handle
|
||||
ComgrUniqueHandle(comgr_T& handle) : comgr_obj_(handle) { handle = {0}; };
|
||||
|
||||
template <typename T = comgr_T,
|
||||
std::enable_if_t<std::is_same_v<T, amd_comgr_data_set_t> ||
|
||||
std::is_same_v<T, amd_comgr_action_info_t>,
|
||||
bool> = true>
|
||||
template <typename T = comgr_T, std::enable_if_t<std::is_same_v<T, amd_comgr_data_set_t> ||
|
||||
std::is_same_v<T, amd_comgr_action_info_t>,
|
||||
bool> = true>
|
||||
[[nodiscard]] amd_comgr_status_t Create() {
|
||||
if constexpr (std::is_same_v<T, amd_comgr_data_set_t>) {
|
||||
return amd::Comgr::create_data_set(&comgr_obj_);
|
||||
@@ -736,9 +735,9 @@ hipError_t FatBinaryInfo::BuildProgram(const int device_id) {
|
||||
|
||||
// If Program was already built skip this step and return success
|
||||
if (dev_programs_[device_id]->IsProgramBuilt(*g_devices[device_id]->devices()[0]) == false) {
|
||||
if (CL_SUCCESS !=
|
||||
dev_programs_[device_id]->build(g_devices[device_id]->devices(), nullptr, nullptr, nullptr,
|
||||
kOptionChangeable, kNewDevProg)) {
|
||||
if (CL_SUCCESS != dev_programs_[device_id]->build(g_devices[device_id]->devices(), nullptr,
|
||||
nullptr, nullptr, kOptionChangeable,
|
||||
kNewDevProg)) {
|
||||
return hipErrorNoBinaryForGpu;
|
||||
}
|
||||
if (!dev_programs_[device_id]->load()) {
|
||||
|
||||
@@ -581,8 +581,8 @@ bool Graph::RunOneNode(Node node, bool wait) {
|
||||
for (auto edge : node->GetEdges()) {
|
||||
// Don't wait in the nodes, executed on the same streams and if it has just one dependency
|
||||
bool wait = ((i < DEBUG_HIP_FORCE_GRAPH_QUEUES) || (edge->GetDependencies().size() > 1))
|
||||
? true
|
||||
: false;
|
||||
? true
|
||||
: false;
|
||||
// Execute the edge node
|
||||
if (!RunOneNode(edge, wait)) {
|
||||
return false;
|
||||
|
||||
@@ -366,9 +366,8 @@ class GraphNode : public hipGraphNodeDOTAttribute {
|
||||
virtual void EnqueueCommands(hip::Stream* stream) {
|
||||
// If the node is disabled it becomes empty node. To maintain ordering just enqueue marker.
|
||||
// Node can be enabled/disabled only for kernel, memcpy and memset nodes.
|
||||
if (!isEnabled_ &&
|
||||
(type_ == hipGraphNodeTypeKernel || type_ == hipGraphNodeTypeMemcpy ||
|
||||
type_ == hipGraphNodeTypeMemset)) {
|
||||
if (!isEnabled_ && (type_ == hipGraphNodeTypeKernel || type_ == hipGraphNodeTypeMemcpy ||
|
||||
type_ == hipGraphNodeTypeMemset)) {
|
||||
amd::Command::EventWaitList waitList;
|
||||
if (!commands_.empty()) {
|
||||
waitList = commands_[0]->eventWaitList();
|
||||
@@ -1677,7 +1676,7 @@ class GraphMemcpyNode1D : public GraphMemcpyNode {
|
||||
label = buffer;
|
||||
} else {
|
||||
label = std::to_string(GetID()) + "\n" + label_ + "\n(" + memcpyDirection + "," +
|
||||
std::to_string(count_) + ")";
|
||||
std::to_string(count_) + ")";
|
||||
}
|
||||
return label;
|
||||
}
|
||||
@@ -1948,7 +1947,7 @@ class GraphMemsetNode : public GraphNode {
|
||||
sizeBytes = memsetParams_.width * memsetParams_.height * depth_ * memsetParams_.elementSize;
|
||||
}
|
||||
label = std::to_string(GetID()) + "\n" + label_ + "\n(" +
|
||||
std::to_string(memsetParams_.value) + "," + std::to_string(sizeBytes) + ")";
|
||||
std::to_string(memsetParams_.value) + "," + std::to_string(sizeBytes) + ")";
|
||||
}
|
||||
return label;
|
||||
}
|
||||
|
||||
@@ -227,8 +227,8 @@ hipError_t hipStreamAttachMemAsync(hipStream_t stream, void* dev_ptr, size_t len
|
||||
// This type of memory may only be specified if the device associated with the
|
||||
// stream reports a non-zero value for the device attribute hipDevAttrPageableMemoryAccess.
|
||||
hip::Stream* hip_stream = (stream == nullptr || stream == hipStreamLegacy)
|
||||
? hip::getCurrentDevice()->NullStream()
|
||||
: hip::getStream(stream);
|
||||
? hip::getCurrentDevice()->NullStream()
|
||||
: hip::getStream(stream);
|
||||
size_t offset = 0;
|
||||
amd::Memory* memObj = getMemoryObject(dev_ptr, offset);
|
||||
if (memObj == nullptr) {
|
||||
@@ -328,13 +328,13 @@ hipError_t ihipMemPrefetchAsync(const void* dev_ptr, size_t count, hipMemLocatio
|
||||
// Pick the specified stream or Null one from the provided target device
|
||||
if (cpuAccess == true) {
|
||||
hip_stream = (stream == nullptr || stream == hipStreamLegacy)
|
||||
? hip::getCurrentDevice()->NullStream()
|
||||
: hip::getStream(stream);
|
||||
? hip::getCurrentDevice()->NullStream()
|
||||
: hip::getStream(stream);
|
||||
} else {
|
||||
dev = g_devices[targetDevice]->devices()[0];
|
||||
hip_stream = (stream == nullptr || stream == hipStreamLegacy)
|
||||
? g_devices[targetDevice]->NullStream()
|
||||
: hip::getStream(stream);
|
||||
? g_devices[targetDevice]->NullStream()
|
||||
: hip::getStream(stream);
|
||||
}
|
||||
|
||||
if (hip_stream == nullptr) {
|
||||
|
||||
@@ -327,9 +327,9 @@ class Stream : public amd::HostQueue {
|
||||
unsigned long long captureID_;
|
||||
|
||||
static inline CommandQueue::Priority convertToQueuePriority(Priority p) {
|
||||
return p == Priority::High ? amd::CommandQueue::Priority::High
|
||||
: p == Priority::Low ? amd::CommandQueue::Priority::Low
|
||||
: amd::CommandQueue::Priority::Normal;
|
||||
return p == Priority::High ? amd::CommandQueue::Priority::High
|
||||
: p == Priority::Low ? amd::CommandQueue::Priority::Low
|
||||
: amd::CommandQueue::Priority::Normal;
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
@@ -67,8 +67,8 @@ hipMemoryType getMemoryType(const amd::Memory* memory) {
|
||||
}
|
||||
|
||||
return ((CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_USE_HOST_PTR) & memory->getMemFlags())
|
||||
? hipMemoryTypeHost
|
||||
: hipMemoryTypeDevice;
|
||||
? hipMemoryTypeHost
|
||||
: hipMemoryTypeDevice;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
@@ -336,8 +336,8 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
|
||||
hip::getCurrentDevice()->SetActiveStatus();
|
||||
|
||||
size_t max_device_size = IS_LINUX
|
||||
? dev_info.maxMemAllocSize_
|
||||
: (dev_info.maxMemAllocSize_ + dev_info.maxPhysicalMemAllocSize_);
|
||||
? dev_info.maxMemAllocSize_
|
||||
: (dev_info.maxMemAllocSize_ + dev_info.maxPhysicalMemAllocSize_);
|
||||
|
||||
if ((useHostDevice && dev_info.maxPhysicalMemAllocSize_ < sizeBytes) ||
|
||||
(!useHostDevice && max_device_size < sizeBytes)) {
|
||||
@@ -401,9 +401,8 @@ hipError_t ihipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) {
|
||||
}
|
||||
|
||||
if (flags == 0 ||
|
||||
flags &
|
||||
(hipHostMallocCoherent | hipHostMallocMapped | hipHostMallocNumaUser |
|
||||
hipHostMallocUncached) ||
|
||||
flags & (hipHostMallocCoherent | hipHostMallocMapped | hipHostMallocNumaUser |
|
||||
hipHostMallocUncached) ||
|
||||
(!(flags & hipHostMallocNonCoherent) && HIP_HOST_COHERENT)) {
|
||||
ihipFlags |= CL_MEM_SVM_ATOMICS;
|
||||
}
|
||||
@@ -1143,7 +1142,7 @@ hipError_t ihipArrayCreate(hipArray_t* array, const HIP_ARRAY3D_DESCRIPTOR* pAll
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
unsigned int flags = hipArrayDefault | hipArrayLayered | hipArraySurfaceLoadStore |
|
||||
hipArrayTextureGather; // hipArrayCubemap isn't supported
|
||||
hipArrayTextureGather; // hipArrayCubemap isn't supported
|
||||
if (pAllocateArray->Flags & (~flags)) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
@@ -1282,9 +1281,8 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) {
|
||||
|
||||
hipError_t ihipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) {
|
||||
if (hostPtr == nullptr || sizeBytes == 0 ||
|
||||
flags &
|
||||
~(hipHostRegisterPortable | hipHostRegisterMapped | hipExtHostRegisterCoarseGrained |
|
||||
hipExtHostRegisterUncached)) {
|
||||
flags & ~(hipHostRegisterPortable | hipHostRegisterMapped | hipExtHostRegisterCoarseGrained |
|
||||
hipExtHostRegisterUncached)) {
|
||||
return hipErrorInvalidValue;
|
||||
} else {
|
||||
unsigned int memFlags = CL_MEM_USE_HOST_PTR | CL_MEM_SVM_ATOMICS;
|
||||
@@ -1377,9 +1375,8 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags) {
|
||||
if (ptr == nullptr) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
if (flags &
|
||||
~(hipHostAllocPortable | hipHostAllocMapped | hipHostAllocWriteCombined |
|
||||
hipHostAllocUncached)) {
|
||||
if (flags & ~(hipHostAllocPortable | hipHostAllocMapped | hipHostAllocWriteCombined |
|
||||
hipHostAllocUncached)) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
|
||||
@@ -1868,9 +1865,9 @@ hipError_t ihipMemcpyHtoH(void* dstHost, const void* srcHost, amd::Coord3D copyR
|
||||
for (size_t slice = 0; slice < copyRegion[2]; slice++) {
|
||||
for (size_t row = 0; row < copyRegion[1]; row++) {
|
||||
const void* srcRow = static_cast<const char*>(srcHost) + srcRect.start_ +
|
||||
row * srcRect.rowPitch_ + slice * srcRect.slicePitch_;
|
||||
row * srcRect.rowPitch_ + slice * srcRect.slicePitch_;
|
||||
void* dstRow = static_cast<char*>(dstHost) + dstRect.start_ + row * dstRect.rowPitch_ +
|
||||
slice * dstRect.slicePitch_;
|
||||
slice * dstRect.slicePitch_;
|
||||
std::memcpy(dstRow, srcRow, copyRegion[0]);
|
||||
}
|
||||
}
|
||||
@@ -2331,9 +2328,8 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipStream_t stream, bool
|
||||
// Transfers from device memory to pageable host memory and transfers from any
|
||||
// host memory to any host memory are synchronous with respect to the host.
|
||||
// Device to Device copies do not need to host side synchronization.
|
||||
if (dstMemoryType == hipMemoryTypeHost ||
|
||||
((pCopy->srcMemoryType == hipMemoryTypeHost) &&
|
||||
(pCopy->dstMemoryType == hipMemoryTypeHost))) {
|
||||
if (dstMemoryType == hipMemoryTypeHost || ((pCopy->srcMemoryType == hipMemoryTypeHost) &&
|
||||
(pCopy->dstMemoryType == hipMemoryTypeHost))) {
|
||||
isAsync = false;
|
||||
} else if ((pCopy->srcMemoryType == hipMemoryTypeDevice) &&
|
||||
(pCopy->dstMemoryType == hipMemoryTypeDevice)) {
|
||||
@@ -4111,7 +4107,7 @@ hipError_t ihipMipmapArrayCreate(hipMipmappedArray_t* mipmapped_array_pptr,
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
unsigned int flags = hipArrayDefault | hipArrayLayered | hipArraySurfaceLoadStore |
|
||||
hipArrayTextureGather; // hipArrayCubemap isn't supported
|
||||
hipArrayTextureGather; // hipArrayCubemap isn't supported
|
||||
if (mipmapped_array_desc_ptr->Flags & (~flags)) {
|
||||
return hipErrorInvalidValue;
|
||||
}
|
||||
|
||||
@@ -380,8 +380,8 @@ hipError_t hipMallocFromPoolAsync(void** dev_ptr, size_t size, hipMemPool_t mem_
|
||||
|
||||
auto mpool = reinterpret_cast<hip::MemoryPool*>(mem_pool);
|
||||
auto hip_stream = (stream == nullptr || stream == hipStreamLegacy)
|
||||
? hip::getCurrentDevice()->NullStream()
|
||||
: reinterpret_cast<hip::Stream*>(stream);
|
||||
? hip::getCurrentDevice()->NullStream()
|
||||
: reinterpret_cast<hip::Stream*>(stream);
|
||||
*dev_ptr = mpool->AllocateMemory(size, hip_stream);
|
||||
if (*dev_ptr == nullptr) {
|
||||
HIP_RETURN(hipErrorOutOfMemory);
|
||||
|
||||
@@ -422,9 +422,9 @@ hipError_t MemoryPool::GetAttribute(hipMemPoolAttr attr, void* value) {
|
||||
break;
|
||||
case hipMemPoolAttrReservedMemCurrent:
|
||||
// All allocated memory by the pool in OS
|
||||
*reinterpret_cast<uint64_t*>(value) = (state_.use_vm_heap_)
|
||||
? MappedSize()
|
||||
: (busy_heap_.GetTotalSize() + free_heap_.GetTotalSize());
|
||||
*reinterpret_cast<uint64_t*>(value) =
|
||||
(state_.use_vm_heap_) ? MappedSize()
|
||||
: (busy_heap_.GetTotalSize() + free_heap_.GetTotalSize());
|
||||
break;
|
||||
case hipMemPoolAttrReservedMemHigh:
|
||||
// High watermark of all allocated memory in OS, since the last reset
|
||||
|
||||
@@ -165,7 +165,7 @@ hipError_t hipFuncGetAttribute(int* value, hipFunction_attribute attrib, hipFunc
|
||||
case HIP_FUNC_ATTRIBUTE_PTX_VERSION:
|
||||
case HIP_FUNC_ATTRIBUTE_BINARY_VERSION:
|
||||
*value = hip::getCurrentDevice()->devices()[0]->isa().versionMajor() * 10 +
|
||||
hip::getCurrentDevice()->devices()[0]->isa().versionMinor();
|
||||
hip::getCurrentDevice()->devices()[0]->isa().versionMinor();
|
||||
break;
|
||||
case HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA:
|
||||
*value = 0;
|
||||
@@ -224,9 +224,8 @@ hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int valu
|
||||
(device::Kernel*)(kernel->getDeviceKernel(*(hip::getCurrentDevice()->devices()[0])));
|
||||
|
||||
if (attr == hipFuncAttributeMaxDynamicSharedMemorySize) {
|
||||
if ((value < 0) ||
|
||||
(value > (d_kernel->workGroupInfo()->availableLDSSize_ -
|
||||
d_kernel->workGroupInfo()->localMemSize_))) {
|
||||
if ((value < 0) || (value > (d_kernel->workGroupInfo()->availableLDSSize_ -
|
||||
d_kernel->workGroupInfo()->localMemSize_))) {
|
||||
HIP_RETURN(hipErrorInvalidValue);
|
||||
}
|
||||
d_kernel->workGroupInfo()->maxDynamicSharedSizeBytes_ = value;
|
||||
|
||||
@@ -79,9 +79,8 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso
|
||||
|
||||
// pResViewDesc can only be specified if the type of resource is a HIP array or a HIP mipmapped
|
||||
// array.
|
||||
if ((pResViewDesc != nullptr) &&
|
||||
((pResDesc->resType != hipResourceTypeArray) &&
|
||||
(pResDesc->resType != hipResourceTypeMipmappedArray))) {
|
||||
if ((pResViewDesc != nullptr) && ((pResDesc->resType != hipResourceTypeArray) &&
|
||||
(pResDesc->resType != hipResourceTypeMipmappedArray))) {
|
||||
return hipErrorUnknown;
|
||||
}
|
||||
|
||||
@@ -176,9 +175,8 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso
|
||||
// hipAddressModeWrap and hipAddressModeMirror won't be supported
|
||||
// and will be switched to hipAddressModeClamp.
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if ((pTexDesc->normalizedCoords == 0) &&
|
||||
((pTexDesc->addressMode[i] == hipAddressModeWrap) ||
|
||||
(pTexDesc->addressMode[i] == hipAddressModeMirror))) {
|
||||
if ((pTexDesc->normalizedCoords == 0) && ((pTexDesc->addressMode[i] == hipAddressModeWrap) ||
|
||||
(pTexDesc->addressMode[i] == hipAddressModeMirror))) {
|
||||
addressMode[i] = hip::getCLAddressingMode(hipAddressModeClamp);
|
||||
}
|
||||
// hipTextureDesc::addressMode is ignored if hipResourceDesc::resType is hipResourceTypeLinear
|
||||
@@ -237,12 +235,14 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso
|
||||
if ((pResViewDesc != nullptr) || (readMode == hipReadModeNormalizedFloat) ||
|
||||
(pTexDesc->sRGB == 1)) {
|
||||
// TODO ROCclr currently right now can only change the format of the image.
|
||||
const cl_channel_order channelOrder = (pResViewDesc != nullptr)
|
||||
? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB)
|
||||
: hip::getCLChannelOrder(pResDesc->res.array.array->NumChannels, pTexDesc->sRGB);
|
||||
const cl_channel_type channelType = (pResViewDesc != nullptr)
|
||||
? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode)
|
||||
: hip::getCLChannelType(pResDesc->res.array.array->Format, readMode);
|
||||
const cl_channel_order channelOrder =
|
||||
(pResViewDesc != nullptr)
|
||||
? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB)
|
||||
: hip::getCLChannelOrder(pResDesc->res.array.array->NumChannels, pTexDesc->sRGB);
|
||||
const cl_channel_type channelType =
|
||||
(pResViewDesc != nullptr)
|
||||
? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode)
|
||||
: hip::getCLChannelType(pResDesc->res.array.array->Format, readMode);
|
||||
const amd::Image::Format imageFormat(cl_image_format{channelOrder, channelType});
|
||||
if (!imageFormat.isValid()) {
|
||||
return hipErrorInvalidValue;
|
||||
@@ -277,12 +277,14 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso
|
||||
if ((pResViewDesc != nullptr) || (readMode == hipReadModeNormalizedFloat) ||
|
||||
(pTexDesc->sRGB == 1)) {
|
||||
// TODO ROCclr currently right now can only change the format of the image.
|
||||
const cl_channel_order channelOrder = (pResViewDesc != nullptr)
|
||||
? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB)
|
||||
: hip::getCLChannelOrder(pResDesc->res.mipmap.mipmap->num_channels, pTexDesc->sRGB);
|
||||
const cl_channel_type channelType = (pResViewDesc != nullptr)
|
||||
? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode)
|
||||
: hip::getCLChannelType(pResDesc->res.mipmap.mipmap->format, readMode);
|
||||
const cl_channel_order channelOrder =
|
||||
(pResViewDesc != nullptr)
|
||||
? hip::getCLChannelOrder(hip::getNumChannels(pResViewDesc->format), pTexDesc->sRGB)
|
||||
: hip::getCLChannelOrder(pResDesc->res.mipmap.mipmap->num_channels, pTexDesc->sRGB);
|
||||
const cl_channel_type channelType =
|
||||
(pResViewDesc != nullptr)
|
||||
? hip::getCLChannelType(hip::getArrayFormat(pResViewDesc->format), readMode)
|
||||
: hip::getCLChannelType(pResDesc->res.mipmap.mipmap->format, readMode);
|
||||
const amd::Image::Format imageFormat(cl_image_format{channelOrder, channelType});
|
||||
if (!imageFormat.isValid()) {
|
||||
return hipErrorInvalidValue;
|
||||
@@ -335,7 +337,8 @@ hipError_t ihipCreateTextureObject(hipTextureObject_t* pTexObject, const hipReso
|
||||
hip::getArrayFormat(pResDesc->res.pitch2D.desc), pTexDesc->readMode);
|
||||
const amd::Image::Format imageFormat({channelOrder, channelType});
|
||||
const cl_mem_object_type imageType = hip::getCLMemObjectType(pResDesc->resType);
|
||||
const size_t imageSizeInBytes = pResDesc->res.pitch2D.width * imageFormat.getElementSize() +
|
||||
const size_t imageSizeInBytes =
|
||||
pResDesc->res.pitch2D.width * imageFormat.getElementSize() +
|
||||
pResDesc->res.pitch2D.pitchInBytes * (pResDesc->res.pitch2D.height - 1);
|
||||
amd::Memory* buffer =
|
||||
getMemoryObjectWithOffset(pResDesc->res.pitch2D.devPtr, imageSizeInBytes);
|
||||
|
||||
@@ -36,9 +36,9 @@ int checkContextProperties(const cl_context_properties* properties, bool* offlin
|
||||
|
||||
namespace amd {
|
||||
|
||||
template <typename T>
|
||||
static inline cl_int clGetInfo(T& field, size_t param_value_size, void* param_value,
|
||||
size_t* param_value_size_ret) {
|
||||
template <typename T> static inline cl_int clGetInfo(T& field, size_t param_value_size,
|
||||
void* param_value,
|
||||
size_t* param_value_size_ret) {
|
||||
const void* valuePtr;
|
||||
size_t valueSize;
|
||||
|
||||
|
||||
@@ -164,9 +164,10 @@ RUNTIME_ENTRY(cl_int, clGetEventInfo,
|
||||
}
|
||||
case CL_EVENT_COMMAND_QUEUE: {
|
||||
amd::Command& command = as_amd(event)->command();
|
||||
cl_command_queue queue = command.queue() == NULL
|
||||
? NULL
|
||||
: const_cast<cl_command_queue>(as_cl(command.queue()->asCommandQueue()));
|
||||
cl_command_queue queue =
|
||||
command.queue() == NULL
|
||||
? NULL
|
||||
: const_cast<cl_command_queue>(as_cl(command.queue()->asCommandQueue()));
|
||||
return amd::clGetInfo(queue, param_value_size, param_value, param_value_size_ret);
|
||||
}
|
||||
case CL_EVENT_COMMAND_TYPE: {
|
||||
|
||||
@@ -885,9 +885,8 @@ RUNTIME_ENTRY(cl_int, clGetGLContextInfoKHR,
|
||||
|
||||
for (cl_uint i = 0; i < num_gpu_devices; ++i) {
|
||||
cl_device_id device = gpu_devices[i];
|
||||
if (is_valid(device) &&
|
||||
as_amd(device)->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_,
|
||||
VALIDATE_ONLY)) {
|
||||
if (is_valid(device) && as_amd(device)->bindExternalDevice(info.flags_, info.hDev_,
|
||||
info.hCtx_, VALIDATE_ONLY)) {
|
||||
return amd::clGetInfo(device, param_value_size, param_value, param_value_size_ret);
|
||||
}
|
||||
}
|
||||
@@ -912,9 +911,8 @@ RUNTIME_ENTRY(cl_int, clGetGLContextInfoKHR,
|
||||
|
||||
for (cl_uint i = 0; i < total_devices; ++i) {
|
||||
cl_device_id device = devices[i];
|
||||
if (is_valid(device) &&
|
||||
as_amd(device)->bindExternalDevice(info.flags_, info.hDev_, info.hCtx_,
|
||||
VALIDATE_ONLY)) {
|
||||
if (is_valid(device) && as_amd(device)->bindExternalDevice(info.flags_, info.hDev_,
|
||||
info.hCtx_, VALIDATE_ONLY)) {
|
||||
compatible_devices.push_back(as_amd(device));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,12 +70,10 @@ static bool validateFlags(cl_mem_flags flags, bool chkReadWrite = false) {
|
||||
temp |= (flags & CL_MEM_KERNEL_READ_AND_WRITE);
|
||||
}
|
||||
|
||||
if (temp &&
|
||||
!(CL_MEM_READ_WRITE == temp || CL_MEM_WRITE_ONLY == temp ||
|
||||
(chkReadWrite &&
|
||||
(CL_MEM_KERNEL_READ_AND_WRITE == temp ||
|
||||
(CL_MEM_KERNEL_READ_AND_WRITE | CL_MEM_READ_WRITE) == temp)) ||
|
||||
CL_MEM_READ_ONLY == temp)) {
|
||||
if (temp && !(CL_MEM_READ_WRITE == temp || CL_MEM_WRITE_ONLY == temp ||
|
||||
(chkReadWrite && (CL_MEM_KERNEL_READ_AND_WRITE == temp ||
|
||||
(CL_MEM_KERNEL_READ_AND_WRITE | CL_MEM_READ_WRITE) == temp)) ||
|
||||
CL_MEM_READ_ONLY == temp)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -89,9 +87,8 @@ static bool validateFlags(cl_mem_flags flags, bool chkReadWrite = false) {
|
||||
}
|
||||
|
||||
if ((flags & CL_MEM_EXTERNAL_PHYSICAL_AMD) &&
|
||||
(flags &
|
||||
(CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE |
|
||||
CL_MEM_READ_ONLY))) {
|
||||
(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_ALLOC_HOST_PTR |
|
||||
CL_MEM_READ_WRITE | CL_MEM_READ_ONLY))) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -414,9 +411,8 @@ RUNTIME_ENTRY_RET(cl_mem, clCreateBuffer,
|
||||
|
||||
// check extensions flag consistency
|
||||
if ((flags & CL_MEM_USE_PERSISTENT_MEM_AMD) &&
|
||||
(flags &
|
||||
(CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_EXTERNAL_PHYSICAL_AMD |
|
||||
CL_MEM_BUS_ADDRESSABLE_AMD))) {
|
||||
(flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_EXTERNAL_PHYSICAL_AMD |
|
||||
CL_MEM_BUS_ADDRESSABLE_AMD))) {
|
||||
*not_null(errcode_ret) = CL_INVALID_VALUE;
|
||||
LogWarning("conflicting flags CL_MEM_USE_PERSISTENT_MEM_AMD and host memory specific flags");
|
||||
return (cl_mem)0;
|
||||
@@ -901,9 +897,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueCopyBuffer,
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
|
||||
if (srcBuffer == dstBuffer &&
|
||||
((src_offset <= dst_offset && dst_offset < src_offset + cb) ||
|
||||
(dst_offset <= src_offset && src_offset < dst_offset + cb))) {
|
||||
if (srcBuffer == dstBuffer && ((src_offset <= dst_offset && dst_offset < src_offset + cb) ||
|
||||
(dst_offset <= src_offset && src_offset < dst_offset + cb))) {
|
||||
return CL_MEM_COPY_OVERLAP;
|
||||
}
|
||||
|
||||
|
||||
@@ -60,9 +60,8 @@ RUNTIME_ENTRY(cl_int, clEnqueueCopyBufferP2PAMD,
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
|
||||
if (srcBuffer == dstBuffer &&
|
||||
((src_offset <= dst_offset && dst_offset < src_offset + cb) ||
|
||||
(dst_offset <= src_offset && src_offset < dst_offset + cb))) {
|
||||
if (srcBuffer == dstBuffer && ((src_offset <= dst_offset && dst_offset < src_offset + cb) ||
|
||||
(dst_offset <= src_offset && src_offset < dst_offset + cb))) {
|
||||
return CL_MEM_COPY_OVERLAP;
|
||||
}
|
||||
|
||||
|
||||
@@ -1833,7 +1833,7 @@ RUNTIME_ENTRY(cl_int, clGetKernelWorkGroupInfo,
|
||||
// Return the amount of used local memory
|
||||
const size_t align = amdDevice.info().minDataTypeAlignSize_;
|
||||
cl_ulong memSize = as_amd(kernel)->parameters().localMemSize(align) +
|
||||
amd::alignUp(devKernel->workGroupInfo()->localMemSize_, align);
|
||||
amd::alignUp(devKernel->workGroupInfo()->localMemSize_, align);
|
||||
return amd::clGetInfo(memSize, param_value_size, param_value, param_value_size_ret);
|
||||
}
|
||||
case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: {
|
||||
|
||||
@@ -1021,9 +1021,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
|
||||
* does not work, because when using a derived type (e.g. Context) the generic
|
||||
* template will provide a better match.
|
||||
*/
|
||||
template <typename Func, typename T>
|
||||
inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int,
|
||||
typename T::cl_type = 0) {
|
||||
template <typename Func, typename T> inline cl_int getInfoHelper(Func f, cl_uint name,
|
||||
VECTOR_CLASS<T>* param, int,
|
||||
typename T::cl_type = 0) {
|
||||
::size_t required;
|
||||
cl_int err = f(name, 0, NULL, &required);
|
||||
if (err != CL_SUCCESS) {
|
||||
@@ -2743,12 +2743,10 @@ template <typename IteratorType>
|
||||
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
|
||||
IteratorType endIterator);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
|
||||
IteratorType startIterator, IteratorType endIterator);
|
||||
|
||||
|
||||
/*! \brief Class interface for Buffer Memory Objects.
|
||||
@@ -2804,9 +2802,9 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL) {
|
||||
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false,
|
||||
cl_int* err = NULL) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -2850,17 +2848,17 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
/*!
|
||||
* \brief Construct a Buffer from a host container via iterators using a specified queue.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
//! \brief Default constructor - initializes to NULL.
|
||||
Buffer() : Memory() {}
|
||||
@@ -5321,8 +5319,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
const VECTOR_CLASS<const void*>* mem_locs = NULL,
|
||||
const VECTOR_CLASS<Event>* events = NULL, Event* event = NULL) const {
|
||||
cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0)
|
||||
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
|
||||
: NULL;
|
||||
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
|
||||
: NULL;
|
||||
|
||||
if (mems != NULL) {
|
||||
for (unsigned int i = 0; i < mem_objects->size(); i++) {
|
||||
@@ -5512,9 +5510,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_;
|
||||
__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
|
||||
#endif // !_WIN32
|
||||
|
||||
template <typename IteratorType>
|
||||
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr, cl_int* err) {
|
||||
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr, cl_int* err) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -5716,9 +5714,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
|
||||
* Host to Device.
|
||||
* Uses specified queue.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer) {
|
||||
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
|
||||
IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@ extern "C" {
|
||||
#define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
||||
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED \
|
||||
CL_EXTENSION_WEAK_LINK \
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
|
||||
|
||||
#ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
|
||||
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
|
||||
@@ -68,7 +68,7 @@ extern "C" {
|
||||
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
|
||||
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED \
|
||||
CL_EXTENSION_WEAK_LINK \
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
|
||||
#else
|
||||
#warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
|
||||
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
||||
|
||||
@@ -1021,9 +1021,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
|
||||
* does not work, because when using a derived type (e.g. Context) the generic
|
||||
* template will provide a better match.
|
||||
*/
|
||||
template <typename Func, typename T>
|
||||
inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int,
|
||||
typename T::cl_type = 0) {
|
||||
template <typename Func, typename T> inline cl_int getInfoHelper(Func f, cl_uint name,
|
||||
VECTOR_CLASS<T>* param, int,
|
||||
typename T::cl_type = 0) {
|
||||
::size_t required;
|
||||
cl_int err = f(name, 0, NULL, &required);
|
||||
if (err != CL_SUCCESS) {
|
||||
@@ -2743,12 +2743,10 @@ template <typename IteratorType>
|
||||
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
|
||||
IteratorType endIterator);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
|
||||
IteratorType startIterator, IteratorType endIterator);
|
||||
|
||||
|
||||
/*! \brief Class interface for Buffer Memory Objects.
|
||||
@@ -2804,9 +2802,9 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL) {
|
||||
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false,
|
||||
cl_int* err = NULL) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -2850,17 +2848,17 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
/*!
|
||||
* \brief Construct a Buffer from a host container via iterators using a specified queue.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
//! \brief Default constructor - initializes to NULL.
|
||||
Buffer() : Memory() {}
|
||||
@@ -5321,8 +5319,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
const VECTOR_CLASS<const void*>* mem_locs = NULL,
|
||||
const VECTOR_CLASS<Event>* events = NULL, Event* event = NULL) const {
|
||||
cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0)
|
||||
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
|
||||
: NULL;
|
||||
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
|
||||
: NULL;
|
||||
|
||||
if (mems != NULL) {
|
||||
for (unsigned int i = 0; i < mem_objects->size(); i++) {
|
||||
@@ -5512,9 +5510,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_;
|
||||
__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
|
||||
#endif // !_WIN32
|
||||
|
||||
template <typename IteratorType>
|
||||
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr, cl_int* err) {
|
||||
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr, cl_int* err) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -5716,9 +5714,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
|
||||
* Host to Device.
|
||||
* Uses specified queue.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer) {
|
||||
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
|
||||
IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
|
||||
@@ -1765,9 +1765,8 @@ template <typename T> inline bool operator!=(const Wrapper<T>& lhs, const Wrappe
|
||||
|
||||
|
||||
using BuildLogType =
|
||||
vector<std::pair<cl::Device,
|
||||
typename detail::param_traits<detail::cl_program_build_info,
|
||||
CL_PROGRAM_BUILD_LOG>::param_type>>;
|
||||
vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info,
|
||||
CL_PROGRAM_BUILD_LOG>::param_type>>;
|
||||
#if defined(CL_HPP_ENABLE_EXCEPTIONS)
|
||||
/**
|
||||
* Exception class for build errors to carry build info
|
||||
@@ -2961,12 +2960,10 @@ template <typename IteratorType>
|
||||
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
|
||||
IteratorType endIterator);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
|
||||
IteratorType startIterator, IteratorType endIterator);
|
||||
|
||||
|
||||
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
|
||||
@@ -3053,8 +3050,8 @@ template <typename T, class SVMTrait> class SVMAllocator {
|
||||
|
||||
SVMAllocator(const SVMAllocator& other) : context_(other.context_) {}
|
||||
|
||||
template <typename U>
|
||||
SVMAllocator(const SVMAllocator<U, SVMTrait>& other) : context_(other.context_) {}
|
||||
template <typename U> SVMAllocator(const SVMAllocator<U, SVMTrait>& other)
|
||||
: context_(other.context_) {}
|
||||
|
||||
~SVMAllocator() {}
|
||||
|
||||
@@ -3272,9 +3269,9 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL) {
|
||||
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false,
|
||||
cl_int* err = NULL) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -3318,17 +3315,17 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
/*!
|
||||
* \brief Construct a Buffer from a host container via iterators using a specified queue.
|
||||
* If useHostPtr is specified iterators must be random access.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
//! \brief Default constructor - initializes to NULL.
|
||||
Buffer() : Memory() {}
|
||||
@@ -4828,8 +4825,7 @@ template <typename T, class Enable = void> struct KernelArgumentHandler;
|
||||
|
||||
// Enable for objects that are not subclasses of memory
|
||||
// Pointers, constants etc
|
||||
template <typename T>
|
||||
struct KernelArgumentHandler<
|
||||
template <typename T> struct KernelArgumentHandler<
|
||||
T, typename std::enable_if<!std::is_base_of<cl::Memory, T>::value>::type> {
|
||||
static size_type size(const T&) { return sizeof(T); }
|
||||
static const T* ptr(const T& value) { return &value; }
|
||||
@@ -4992,9 +4988,8 @@ class Kernel : public detail::Wrapper<cl_kernel> {
|
||||
__GET_KERNEL_ARG_INFO_ERR);
|
||||
}
|
||||
|
||||
template <cl_int name>
|
||||
size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
|
||||
cl_int* err = NULL) const {
|
||||
template <cl_int name> size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
|
||||
cl_int* err = NULL) const {
|
||||
size_type param;
|
||||
cl_int result = getSubGroupInfo(dev, name, range, &param);
|
||||
if (err != NULL) {
|
||||
@@ -5591,9 +5586,8 @@ inline Program linkProgram(vector<Program> inputPrograms, const char* options =
|
||||
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
|
||||
|
||||
// Template specialization for CL_PROGRAM_BINARIES
|
||||
template <>
|
||||
inline cl_int cl::Program::getInfo(cl_program_info name,
|
||||
vector<vector<unsigned char>>* param) const {
|
||||
template <> inline cl_int cl::Program::getInfo(cl_program_info name,
|
||||
vector<vector<unsigned char>>* param) const {
|
||||
if (name != CL_PROGRAM_BINARIES) {
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
@@ -6367,9 +6361,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
* Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.
|
||||
* This variant takes a raw SVM pointer.
|
||||
*/
|
||||
template <typename T>
|
||||
cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
|
||||
const vector<Event>* events = NULL, Event* event = NULL) const {
|
||||
template <typename T> cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
|
||||
size_type size, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
cl_event tmp;
|
||||
cl_int err = detail::errHandler(
|
||||
::clEnqueueSVMMap(
|
||||
@@ -6468,9 +6462,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::pointer instance.
|
||||
*/
|
||||
template <typename T, class D>
|
||||
cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
template <typename T, class D> cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
cl_event tmp;
|
||||
cl_int err = detail::errHandler(
|
||||
::clEnqueueSVMUnmap(
|
||||
@@ -6488,9 +6482,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::vector instance.
|
||||
*/
|
||||
template <typename T, class Alloc>
|
||||
cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
template <typename T, class Alloc> cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
cl_event tmp;
|
||||
cl_int err = detail::errHandler(
|
||||
::clEnqueueSVMUnmap(
|
||||
@@ -6827,8 +6821,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
cl::Context context = cl::Context::getDefault();
|
||||
cl::Device device = cl::Device::getDefault();
|
||||
|
||||
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
|
||||
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
|
||||
cl_command_queue_properties mergedProperties =
|
||||
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
|
||||
static_cast<cl_command_queue_properties>(properties);
|
||||
|
||||
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
|
||||
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
|
||||
@@ -6847,8 +6842,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
cl_int* err = NULL) {
|
||||
cl_int error;
|
||||
|
||||
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
|
||||
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
|
||||
cl_command_queue_properties mergedProperties =
|
||||
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
|
||||
static_cast<cl_command_queue_properties>(properties);
|
||||
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
|
||||
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
|
||||
|
||||
@@ -6866,8 +6862,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
cl_int* err = NULL) {
|
||||
cl_int error;
|
||||
|
||||
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
|
||||
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
|
||||
cl_command_queue_properties mergedProperties =
|
||||
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
|
||||
static_cast<cl_command_queue_properties>(properties);
|
||||
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, CL_QUEUE_SIZE,
|
||||
queueSize, 0};
|
||||
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
|
||||
@@ -7021,9 +7018,9 @@ template <> struct KernelArgumentHandler<cl::DeviceCommandQueue, void> {
|
||||
#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
|
||||
|
||||
|
||||
template <typename IteratorType>
|
||||
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr, cl_int* err) {
|
||||
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr, cl_int* err) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -7163,9 +7160,9 @@ inline void* enqueueMapBuffer(const Buffer& buffer, cl_bool blocking, cl_map_fla
|
||||
* update a region of a coarse-grained SVM buffer.
|
||||
* This variant takes a raw SVM pointer.
|
||||
*/
|
||||
template <typename T>
|
||||
inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
|
||||
const vector<Event>* events, Event* event) {
|
||||
template <typename T> inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
|
||||
size_type size, const vector<Event>* events,
|
||||
Event* event) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7180,10 +7177,10 @@ inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_t
|
||||
* update a region of a coarse-grained SVM buffer.
|
||||
* This variant takes a cl::pointer instance.
|
||||
*/
|
||||
template <typename T, class D>
|
||||
inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking, cl_map_flags flags,
|
||||
size_type size, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
template <typename T, class D> inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking,
|
||||
cl_map_flags flags, size_type size,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7257,9 +7254,9 @@ inline cl_int enqueueUnmapSVM(T* ptr, const vector<Event>* events = NULL, Event*
|
||||
* SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::pointer instance.
|
||||
*/
|
||||
template <typename T, class D>
|
||||
inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
template <typename T, class D> inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7275,9 +7272,9 @@ inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* event
|
||||
* SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::vector instance.
|
||||
*/
|
||||
template <typename T, class Alloc>
|
||||
inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
template <typename T, class Alloc> inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7336,9 +7333,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
|
||||
* Host to Device.
|
||||
* Uses specified queue.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer) {
|
||||
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
|
||||
IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
|
||||
@@ -67,7 +67,7 @@ extern "C" {
|
||||
#define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
||||
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED \
|
||||
CL_EXTENSION_WEAK_LINK \
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
|
||||
|
||||
#ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
|
||||
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
|
||||
@@ -76,7 +76,7 @@ extern "C" {
|
||||
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
|
||||
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED \
|
||||
CL_EXTENSION_WEAK_LINK \
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
|
||||
#else
|
||||
#warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
|
||||
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
||||
|
||||
@@ -1021,9 +1021,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
|
||||
* does not work, because when using a derived type (e.g. Context) the generic
|
||||
* template will provide a better match.
|
||||
*/
|
||||
template <typename Func, typename T>
|
||||
inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int,
|
||||
typename T::cl_type = 0) {
|
||||
template <typename Func, typename T> inline cl_int getInfoHelper(Func f, cl_uint name,
|
||||
VECTOR_CLASS<T>* param, int,
|
||||
typename T::cl_type = 0) {
|
||||
::size_t required;
|
||||
cl_int err = f(name, 0, NULL, &required);
|
||||
if (err != CL_SUCCESS) {
|
||||
@@ -2743,12 +2743,10 @@ template <typename IteratorType>
|
||||
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
|
||||
IteratorType endIterator);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
|
||||
IteratorType startIterator, IteratorType endIterator);
|
||||
|
||||
|
||||
/*! \brief Class interface for Buffer Memory Objects.
|
||||
@@ -2804,9 +2802,9 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL) {
|
||||
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false,
|
||||
cl_int* err = NULL) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -2850,17 +2848,17 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
/*!
|
||||
* \brief Construct a Buffer from a host container via iterators using a specified queue.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
//! \brief Default constructor - initializes to NULL.
|
||||
Buffer() : Memory() {}
|
||||
@@ -5321,8 +5319,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
const VECTOR_CLASS<const void*>* mem_locs = NULL,
|
||||
const VECTOR_CLASS<Event>* events = NULL, Event* event = NULL) const {
|
||||
cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0)
|
||||
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
|
||||
: NULL;
|
||||
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
|
||||
: NULL;
|
||||
|
||||
if (mems != NULL) {
|
||||
for (unsigned int i = 0; i < mem_objects->size(); i++) {
|
||||
@@ -5512,9 +5510,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_;
|
||||
__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
|
||||
#endif // !_WIN32
|
||||
|
||||
template <typename IteratorType>
|
||||
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr, cl_int* err) {
|
||||
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr, cl_int* err) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -5716,9 +5714,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
|
||||
* Host to Device.
|
||||
* Uses specified queue.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer) {
|
||||
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
|
||||
IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
|
||||
@@ -1765,9 +1765,8 @@ template <typename T> inline bool operator!=(const Wrapper<T>& lhs, const Wrappe
|
||||
|
||||
|
||||
using BuildLogType =
|
||||
vector<std::pair<cl::Device,
|
||||
typename detail::param_traits<detail::cl_program_build_info,
|
||||
CL_PROGRAM_BUILD_LOG>::param_type>>;
|
||||
vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info,
|
||||
CL_PROGRAM_BUILD_LOG>::param_type>>;
|
||||
#if defined(CL_HPP_ENABLE_EXCEPTIONS)
|
||||
/**
|
||||
* Exception class for build errors to carry build info
|
||||
@@ -2961,12 +2960,10 @@ template <typename IteratorType>
|
||||
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
|
||||
IteratorType endIterator);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
|
||||
IteratorType startIterator, IteratorType endIterator);
|
||||
|
||||
|
||||
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
|
||||
@@ -3053,8 +3050,8 @@ template <typename T, class SVMTrait> class SVMAllocator {
|
||||
|
||||
SVMAllocator(const SVMAllocator& other) : context_(other.context_) {}
|
||||
|
||||
template <typename U>
|
||||
SVMAllocator(const SVMAllocator<U, SVMTrait>& other) : context_(other.context_) {}
|
||||
template <typename U> SVMAllocator(const SVMAllocator<U, SVMTrait>& other)
|
||||
: context_(other.context_) {}
|
||||
|
||||
~SVMAllocator() {}
|
||||
|
||||
@@ -3272,9 +3269,9 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL) {
|
||||
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false,
|
||||
cl_int* err = NULL) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -3318,17 +3315,17 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
/*!
|
||||
* \brief Construct a Buffer from a host container via iterators using a specified queue.
|
||||
* If useHostPtr is specified iterators must be random access.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
//! \brief Default constructor - initializes to NULL.
|
||||
Buffer() : Memory() {}
|
||||
@@ -4828,8 +4825,7 @@ template <typename T, class Enable = void> struct KernelArgumentHandler;
|
||||
|
||||
// Enable for objects that are not subclasses of memory
|
||||
// Pointers, constants etc
|
||||
template <typename T>
|
||||
struct KernelArgumentHandler<
|
||||
template <typename T> struct KernelArgumentHandler<
|
||||
T, typename std::enable_if<!std::is_base_of<cl::Memory, T>::value>::type> {
|
||||
static size_type size(const T&) { return sizeof(T); }
|
||||
static const T* ptr(const T& value) { return &value; }
|
||||
@@ -4992,9 +4988,8 @@ class Kernel : public detail::Wrapper<cl_kernel> {
|
||||
__GET_KERNEL_ARG_INFO_ERR);
|
||||
}
|
||||
|
||||
template <cl_int name>
|
||||
size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
|
||||
cl_int* err = NULL) const {
|
||||
template <cl_int name> size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
|
||||
cl_int* err = NULL) const {
|
||||
size_type param;
|
||||
cl_int result = getSubGroupInfo(dev, name, range, &param);
|
||||
if (err != NULL) {
|
||||
@@ -5591,9 +5586,8 @@ inline Program linkProgram(vector<Program> inputPrograms, const char* options =
|
||||
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
|
||||
|
||||
// Template specialization for CL_PROGRAM_BINARIES
|
||||
template <>
|
||||
inline cl_int cl::Program::getInfo(cl_program_info name,
|
||||
vector<vector<unsigned char>>* param) const {
|
||||
template <> inline cl_int cl::Program::getInfo(cl_program_info name,
|
||||
vector<vector<unsigned char>>* param) const {
|
||||
if (name != CL_PROGRAM_BINARIES) {
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
@@ -6367,9 +6361,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
* Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.
|
||||
* This variant takes a raw SVM pointer.
|
||||
*/
|
||||
template <typename T>
|
||||
cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
|
||||
const vector<Event>* events = NULL, Event* event = NULL) const {
|
||||
template <typename T> cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
|
||||
size_type size, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
cl_event tmp;
|
||||
cl_int err = detail::errHandler(
|
||||
::clEnqueueSVMMap(
|
||||
@@ -6468,9 +6462,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::pointer instance.
|
||||
*/
|
||||
template <typename T, class D>
|
||||
cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
template <typename T, class D> cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
cl_event tmp;
|
||||
cl_int err = detail::errHandler(
|
||||
::clEnqueueSVMUnmap(
|
||||
@@ -6488,9 +6482,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::vector instance.
|
||||
*/
|
||||
template <typename T, class Alloc>
|
||||
cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
template <typename T, class Alloc> cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
cl_event tmp;
|
||||
cl_int err = detail::errHandler(
|
||||
::clEnqueueSVMUnmap(
|
||||
@@ -6827,8 +6821,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
cl::Context context = cl::Context::getDefault();
|
||||
cl::Device device = cl::Device::getDefault();
|
||||
|
||||
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
|
||||
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
|
||||
cl_command_queue_properties mergedProperties =
|
||||
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
|
||||
static_cast<cl_command_queue_properties>(properties);
|
||||
|
||||
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
|
||||
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
|
||||
@@ -6847,8 +6842,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
cl_int* err = NULL) {
|
||||
cl_int error;
|
||||
|
||||
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
|
||||
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
|
||||
cl_command_queue_properties mergedProperties =
|
||||
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
|
||||
static_cast<cl_command_queue_properties>(properties);
|
||||
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
|
||||
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
|
||||
|
||||
@@ -6866,8 +6862,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
cl_int* err = NULL) {
|
||||
cl_int error;
|
||||
|
||||
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
|
||||
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
|
||||
cl_command_queue_properties mergedProperties =
|
||||
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
|
||||
static_cast<cl_command_queue_properties>(properties);
|
||||
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, CL_QUEUE_SIZE,
|
||||
queueSize, 0};
|
||||
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
|
||||
@@ -7021,9 +7018,9 @@ template <> struct KernelArgumentHandler<cl::DeviceCommandQueue, void> {
|
||||
#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
|
||||
|
||||
|
||||
template <typename IteratorType>
|
||||
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr, cl_int* err) {
|
||||
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr, cl_int* err) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -7163,9 +7160,9 @@ inline void* enqueueMapBuffer(const Buffer& buffer, cl_bool blocking, cl_map_fla
|
||||
* update a region of a coarse-grained SVM buffer.
|
||||
* This variant takes a raw SVM pointer.
|
||||
*/
|
||||
template <typename T>
|
||||
inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
|
||||
const vector<Event>* events, Event* event) {
|
||||
template <typename T> inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
|
||||
size_type size, const vector<Event>* events,
|
||||
Event* event) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7180,10 +7177,10 @@ inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_t
|
||||
* update a region of a coarse-grained SVM buffer.
|
||||
* This variant takes a cl::pointer instance.
|
||||
*/
|
||||
template <typename T, class D>
|
||||
inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking, cl_map_flags flags,
|
||||
size_type size, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
template <typename T, class D> inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking,
|
||||
cl_map_flags flags, size_type size,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7257,9 +7254,9 @@ inline cl_int enqueueUnmapSVM(T* ptr, const vector<Event>* events = NULL, Event*
|
||||
* SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::pointer instance.
|
||||
*/
|
||||
template <typename T, class D>
|
||||
inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
template <typename T, class D> inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7275,9 +7272,9 @@ inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* event
|
||||
* SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::vector instance.
|
||||
*/
|
||||
template <typename T, class Alloc>
|
||||
inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
template <typename T, class Alloc> inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7336,9 +7333,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
|
||||
* Host to Device.
|
||||
* Uses specified queue.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer) {
|
||||
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
|
||||
IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
|
||||
@@ -67,7 +67,7 @@ extern "C" {
|
||||
#define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
||||
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED \
|
||||
CL_EXTENSION_WEAK_LINK \
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
|
||||
|
||||
#ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
|
||||
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
|
||||
@@ -76,7 +76,7 @@ extern "C" {
|
||||
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
|
||||
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED \
|
||||
CL_EXTENSION_WEAK_LINK \
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
|
||||
AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
|
||||
#else
|
||||
#warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here!
|
||||
#define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
|
||||
|
||||
@@ -1009,9 +1009,9 @@ inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
|
||||
* does not work, because when using a derived type (e.g. Context) the generic
|
||||
* template will provide a better match.
|
||||
*/
|
||||
template <typename Func, typename T>
|
||||
inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int,
|
||||
typename T::cl_type = 0) {
|
||||
template <typename Func, typename T> inline cl_int getInfoHelper(Func f, cl_uint name,
|
||||
VECTOR_CLASS<T>* param, int,
|
||||
typename T::cl_type = 0) {
|
||||
::size_t required;
|
||||
cl_int err = f(name, 0, NULL, &required);
|
||||
if (err != CL_SUCCESS) {
|
||||
@@ -2736,12 +2736,10 @@ template <typename IteratorType>
|
||||
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
|
||||
IteratorType endIterator);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
|
||||
IteratorType startIterator, IteratorType endIterator);
|
||||
|
||||
|
||||
/*! \brief Class interface for Buffer Memory Objects.
|
||||
@@ -2797,9 +2795,9 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL) {
|
||||
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false,
|
||||
cl_int* err = NULL) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -2843,17 +2841,17 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
/*!
|
||||
* \brief Construct a Buffer from a host container via iterators using a specified queue.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
//! \brief Default constructor - initializes to NULL.
|
||||
Buffer() : Memory() {}
|
||||
@@ -5314,8 +5312,8 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
const VECTOR_CLASS<const void*>* mem_locs = NULL,
|
||||
const VECTOR_CLASS<Event>* events = NULL, Event* event = NULL) const {
|
||||
cl_mem* mems = (mem_objects != NULL && mem_objects->size() > 0)
|
||||
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
|
||||
: NULL;
|
||||
? (cl_mem*)alloca(mem_objects->size() * sizeof(cl_mem))
|
||||
: NULL;
|
||||
|
||||
if (mems != NULL) {
|
||||
for (unsigned int i = 0; i < mem_objects->size(); i++) {
|
||||
@@ -5505,9 +5503,9 @@ __attribute__((weak)) CommandQueue CommandQueue::default_;
|
||||
__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
|
||||
#endif // !_WIN32
|
||||
|
||||
template <typename IteratorType>
|
||||
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr, cl_int* err) {
|
||||
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr, cl_int* err) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -5709,9 +5707,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
|
||||
* Host to Device.
|
||||
* Uses specified queue.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer) {
|
||||
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
|
||||
IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
|
||||
@@ -1753,9 +1753,8 @@ template <typename T> inline bool operator!=(const Wrapper<T>& lhs, const Wrappe
|
||||
|
||||
|
||||
using BuildLogType =
|
||||
vector<std::pair<cl::Device,
|
||||
typename detail::param_traits<detail::cl_program_build_info,
|
||||
CL_PROGRAM_BUILD_LOG>::param_type>>;
|
||||
vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info,
|
||||
CL_PROGRAM_BUILD_LOG>::param_type>>;
|
||||
#if defined(CL_HPP_ENABLE_EXCEPTIONS)
|
||||
/**
|
||||
* Exception class for build errors to carry build info
|
||||
@@ -2951,12 +2950,10 @@ template <typename IteratorType>
|
||||
cl_int copy(IteratorType startIterator, IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, IteratorType endIterator);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer);
|
||||
template <typename IteratorType>
|
||||
cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer, IteratorType startIterator,
|
||||
IteratorType endIterator);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer);
|
||||
template <typename IteratorType> cl_int copy(const CommandQueue& queue, const cl::Buffer& buffer,
|
||||
IteratorType startIterator, IteratorType endIterator);
|
||||
|
||||
|
||||
#if CL_HPP_TARGET_OPENCL_VERSION >= 200
|
||||
@@ -3043,8 +3040,8 @@ template <typename T, class SVMTrait> class SVMAllocator {
|
||||
|
||||
SVMAllocator(const SVMAllocator& other) : context_(other.context_) {}
|
||||
|
||||
template <typename U>
|
||||
SVMAllocator(const SVMAllocator<U, SVMTrait>& other) : context_(other.context_) {}
|
||||
template <typename U> SVMAllocator(const SVMAllocator<U, SVMTrait>& other)
|
||||
: context_(other.context_) {}
|
||||
|
||||
~SVMAllocator() {}
|
||||
|
||||
@@ -3262,9 +3259,9 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(IteratorType startIterator, IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL) {
|
||||
template <typename IteratorType> Buffer(IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false,
|
||||
cl_int* err = NULL) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -3308,17 +3305,17 @@ class Buffer : public Memory {
|
||||
* IteratorType must be random access.
|
||||
* If useHostPtr is specified iterators must represent contiguous data.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
/*!
|
||||
* \brief Construct a Buffer from a host container via iterators using a specified queue.
|
||||
* If useHostPtr is specified iterators must be random access.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
Buffer(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr = false, cl_int* err = NULL);
|
||||
template <typename IteratorType> Buffer(const CommandQueue& queue, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr = false, cl_int* err = NULL);
|
||||
|
||||
//! \brief Default constructor - initializes to NULL.
|
||||
Buffer() : Memory() {}
|
||||
@@ -4818,8 +4815,7 @@ template <typename T, class Enable = void> struct KernelArgumentHandler;
|
||||
|
||||
// Enable for objects that are not subclasses of memory
|
||||
// Pointers, constants etc
|
||||
template <typename T>
|
||||
struct KernelArgumentHandler<
|
||||
template <typename T> struct KernelArgumentHandler<
|
||||
T, typename std::enable_if<!std::is_base_of<cl::Memory, T>::value>::type> {
|
||||
static size_type size(const T&) { return sizeof(T); }
|
||||
static const T* ptr(const T& value) { return &value; }
|
||||
@@ -4982,9 +4978,8 @@ class Kernel : public detail::Wrapper<cl_kernel> {
|
||||
__GET_KERNEL_ARG_INFO_ERR);
|
||||
}
|
||||
|
||||
template <cl_int name>
|
||||
size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
|
||||
cl_int* err = NULL) const {
|
||||
template <cl_int name> size_type getSubGroupInfo(const cl::Device& dev, const cl::NDRange& range,
|
||||
cl_int* err = NULL) const {
|
||||
size_type param;
|
||||
cl_int result = getSubGroupInfo(dev, name, range, &param);
|
||||
if (err != NULL) {
|
||||
@@ -5581,9 +5576,8 @@ inline Program linkProgram(vector<Program> inputPrograms, const char* options =
|
||||
#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
|
||||
|
||||
// Template specialization for CL_PROGRAM_BINARIES
|
||||
template <>
|
||||
inline cl_int cl::Program::getInfo(cl_program_info name,
|
||||
vector<vector<unsigned char>>* param) const {
|
||||
template <> inline cl_int cl::Program::getInfo(cl_program_info name,
|
||||
vector<vector<unsigned char>>* param) const {
|
||||
if (name != CL_PROGRAM_BINARIES) {
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
@@ -6357,9 +6351,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
* Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer.
|
||||
* This variant takes a raw SVM pointer.
|
||||
*/
|
||||
template <typename T>
|
||||
cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
|
||||
const vector<Event>* events = NULL, Event* event = NULL) const {
|
||||
template <typename T> cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
|
||||
size_type size, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
cl_event tmp;
|
||||
cl_int err = detail::errHandler(
|
||||
::clEnqueueSVMMap(
|
||||
@@ -6458,9 +6452,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::pointer instance.
|
||||
*/
|
||||
template <typename T, class D>
|
||||
cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
template <typename T, class D> cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
cl_event tmp;
|
||||
cl_int err = detail::errHandler(
|
||||
::clEnqueueSVMUnmap(
|
||||
@@ -6478,9 +6472,9 @@ class CommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
* Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::vector instance.
|
||||
*/
|
||||
template <typename T, class Alloc>
|
||||
cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
template <typename T, class Alloc> cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) const {
|
||||
cl_event tmp;
|
||||
cl_int err = detail::errHandler(
|
||||
::clEnqueueSVMUnmap(
|
||||
@@ -6817,8 +6811,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
cl::Context context = cl::Context::getDefault();
|
||||
cl::Device device = cl::Device::getDefault();
|
||||
|
||||
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
|
||||
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
|
||||
cl_command_queue_properties mergedProperties =
|
||||
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
|
||||
static_cast<cl_command_queue_properties>(properties);
|
||||
|
||||
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
|
||||
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
|
||||
@@ -6837,8 +6832,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
cl_int* err = NULL) {
|
||||
cl_int error;
|
||||
|
||||
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
|
||||
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
|
||||
cl_command_queue_properties mergedProperties =
|
||||
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
|
||||
static_cast<cl_command_queue_properties>(properties);
|
||||
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, 0};
|
||||
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
|
||||
|
||||
@@ -6856,8 +6852,9 @@ class DeviceCommandQueue : public detail::Wrapper<cl_command_queue> {
|
||||
cl_int* err = NULL) {
|
||||
cl_int error;
|
||||
|
||||
cl_command_queue_properties mergedProperties = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
|
||||
CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
|
||||
cl_command_queue_properties mergedProperties =
|
||||
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE |
|
||||
static_cast<cl_command_queue_properties>(properties);
|
||||
cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, mergedProperties, CL_QUEUE_SIZE,
|
||||
queueSize, 0};
|
||||
object_ = ::clCreateCommandQueueWithProperties(context(), device(), queue_properties, &error);
|
||||
@@ -7011,9 +7008,9 @@ template <> struct KernelArgumentHandler<cl::DeviceCommandQueue, void> {
|
||||
#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
|
||||
|
||||
|
||||
template <typename IteratorType>
|
||||
Buffer::Buffer(const Context& context, IteratorType startIterator, IteratorType endIterator,
|
||||
bool readOnly, bool useHostPtr, cl_int* err) {
|
||||
template <typename IteratorType> Buffer::Buffer(const Context& context, IteratorType startIterator,
|
||||
IteratorType endIterator, bool readOnly,
|
||||
bool useHostPtr, cl_int* err) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
@@ -7153,9 +7150,9 @@ inline void* enqueueMapBuffer(const Buffer& buffer, cl_bool blocking, cl_map_fla
|
||||
* update a region of a coarse-grained SVM buffer.
|
||||
* This variant takes a raw SVM pointer.
|
||||
*/
|
||||
template <typename T>
|
||||
inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_type size,
|
||||
const vector<Event>* events, Event* event) {
|
||||
template <typename T> inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags,
|
||||
size_type size, const vector<Event>* events,
|
||||
Event* event) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7170,10 +7167,10 @@ inline cl_int enqueueMapSVM(T* ptr, cl_bool blocking, cl_map_flags flags, size_t
|
||||
* update a region of a coarse-grained SVM buffer.
|
||||
* This variant takes a cl::pointer instance.
|
||||
*/
|
||||
template <typename T, class D>
|
||||
inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking, cl_map_flags flags,
|
||||
size_type size, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
template <typename T, class D> inline cl_int enqueueMapSVM(cl::pointer<T, D> ptr, cl_bool blocking,
|
||||
cl_map_flags flags, size_type size,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7247,9 +7244,9 @@ inline cl_int enqueueUnmapSVM(T* ptr, const vector<Event>* events = NULL, Event*
|
||||
* SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::pointer instance.
|
||||
*/
|
||||
template <typename T, class D>
|
||||
inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
template <typename T, class D> inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7265,9 +7262,9 @@ inline cl_int enqueueUnmapSVM(cl::pointer<T, D>& ptr, const vector<Event>* event
|
||||
* SVM buffer back to the OpenCL runtime.
|
||||
* This variant takes a cl::vector instance.
|
||||
*/
|
||||
template <typename T, class Alloc>
|
||||
inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container, const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
template <typename T, class Alloc> inline cl_int enqueueUnmapSVM(cl::vector<T, Alloc>& container,
|
||||
const vector<Event>* events = NULL,
|
||||
Event* event = NULL) {
|
||||
cl_int error;
|
||||
CommandQueue queue = CommandQueue::getDefault(&error);
|
||||
if (error != CL_SUCCESS) {
|
||||
@@ -7326,9 +7323,9 @@ inline cl_int copy(const cl::Buffer& buffer, IteratorType startIterator, Iterato
|
||||
* Host to Device.
|
||||
* Uses specified queue.
|
||||
*/
|
||||
template <typename IteratorType>
|
||||
inline cl_int copy(const CommandQueue& queue, IteratorType startIterator, IteratorType endIterator,
|
||||
cl::Buffer& buffer) {
|
||||
template <typename IteratorType> inline cl_int copy(const CommandQueue& queue,
|
||||
IteratorType startIterator,
|
||||
IteratorType endIterator, cl::Buffer& buffer) {
|
||||
typedef typename std::iterator_traits<IteratorType>::value_type DataType;
|
||||
cl_int error;
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ void OCLDX11YUY2::run(void) {
|
||||
BYTE* pLine = (BYTE*)LockedRectD11.pData + y * LockedRectD11.RowPitch;
|
||||
|
||||
BYTE* pLineUV = (BYTE*)LockedRectD11.pData + y * LockedRectD11.RowPitch +
|
||||
OCLDX11YUY2::HEIGHT * LockedRectD11.RowPitch;
|
||||
OCLDX11YUY2::HEIGHT * LockedRectD11.RowPitch;
|
||||
|
||||
for (int x = 0; x < OCLDX11YUY2::WIDTH; x++) {
|
||||
*pLine++ = 0x7F; // Y
|
||||
|
||||
@@ -265,7 +265,7 @@ void OCLPerfGenericBandwidth::run(void) {
|
||||
// We have one extra write per LDS location to initialize LDS
|
||||
double perf = ((double)global * (numReads_ * sizeof(cl_float) + dataSizeBytes_ / 64) * NUM_ITER *
|
||||
(double)(1e-09)) /
|
||||
sec;
|
||||
sec;
|
||||
|
||||
_perfInfo = (float)perf;
|
||||
SNPRINTF(buf, sizeof(buf), " %6s %9s %8d threads, %3d reads (GB/s) ", buf2, buf3, global,
|
||||
|
||||
@@ -401,8 +401,8 @@ void OCLPerfKernelThroughput::open(unsigned int test, char* units, double& conve
|
||||
input2BufferSize_ = static_cast<size_t>(matrixDim2_ * matrixDim1_ * sizeof(float));
|
||||
output1BufferSize_ = static_cast<size_t>(matrixDim1_ * matrixDim1_ * sizeof(float));
|
||||
_reqDataSize = (1.0 * matrixDim1_ * matrixDim2_ * sizeof(float)) +
|
||||
(1.0 * matrixDim2_ * matrixDim1_ * sizeof(float)) +
|
||||
(1.0 * matrixDim1_ * matrixDim1_ * sizeof(float));
|
||||
(1.0 * matrixDim2_ * matrixDim1_ * sizeof(float)) +
|
||||
(1.0 * matrixDim1_ * matrixDim1_ * sizeof(float));
|
||||
break;
|
||||
case 1: // Flops/Byte
|
||||
flopsPerByte_ = (int)workSize[workSizeIdx_]; // for kernelType == 0
|
||||
@@ -695,13 +695,13 @@ void OCLPerfKernelThroughput::run(void) {
|
||||
// printf("FlopCount = 2*%i*%i*%i=%f\n",
|
||||
// matrixDim1_,matrixDim1_,matrixDim2_,flopCount);
|
||||
bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * 1000000.f /
|
||||
avgKernelTime_; // GB/s
|
||||
avgKernelTime_; // GB/s
|
||||
gflops_ = (float)(1000000.f * flopCount / avgKernelTime_ / 1000000000.0);
|
||||
break;
|
||||
case 1: // Madds
|
||||
flopCount = _reqDataSize * flopsPerByte_;
|
||||
bandwidth_ = (float)(1.f * _reqDataSize / 1024.f / 1024.f / 1024.f) * 1000000.f /
|
||||
avgKernelTime_; // GB/s
|
||||
avgKernelTime_; // GB/s
|
||||
gflops_ = bandwidth_ * flopsPerByte_;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -341,7 +341,7 @@ void OCLPerfLDSReadSpeed::run(void) {
|
||||
// We have one extra write per LDS location to initialize LDS
|
||||
double perf = ((double)global * (numReads_ * sizeof(cl_float) + ldsSizeBytes_ / 64) * NUM_ITER *
|
||||
(double)(1e-09)) /
|
||||
sec;
|
||||
sec;
|
||||
|
||||
_perfInfo = (float)perf;
|
||||
SNPRINTF(buf, sizeof(buf), " %s %8d threads, %3d reads (GB/s) ", buf2, global, numReads_);
|
||||
|
||||
@@ -749,10 +749,9 @@ void OCLPerfMandelbrot::run(void) {
|
||||
// printf(" totalIter = %lld\n", totalIters);
|
||||
if (isAMD && (type_ == CL_DEVICE_TYPE_GPU)) {
|
||||
CHECK_RESULT((totalIters != expectedIters[_openTest]) &&
|
||||
(totalIters !=
|
||||
expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX
|
||||
? _openTest + FMA_EXPECTEDVALUES_INDEX
|
||||
: _openTest)]),
|
||||
(totalIters != expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX
|
||||
? _openTest + FMA_EXPECTEDVALUES_INDEX
|
||||
: _openTest)]),
|
||||
"Incorrect iteration count detected!");
|
||||
} else {
|
||||
CHECK_RESULT(totalIters != expectedItersNV[_openTest], "Incorrect iteration count detected!");
|
||||
@@ -869,11 +868,9 @@ void OCLPerfAsyncMandelbrot::run(void) {
|
||||
// printf(" totalIter = %lld\n", totalIters);
|
||||
if (isAMD && (type_ == CL_DEVICE_TYPE_GPU)) {
|
||||
CHECK_RESULT((totalIters != 2 * expectedIters[_openTest]) &&
|
||||
(totalIters !=
|
||||
2 *
|
||||
expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX
|
||||
? _openTest + FMA_EXPECTEDVALUES_INDEX
|
||||
: _openTest)]),
|
||||
(totalIters != 2 * expectedIters[(_openTest < FMA_EXPECTEDVALUES_INDEX
|
||||
? _openTest + FMA_EXPECTEDVALUES_INDEX
|
||||
: _openTest)]),
|
||||
"Incorrect iteration count detected!");
|
||||
} else {
|
||||
CHECK_RESULT(totalIters != 2 * expectedItersNV[_openTest],
|
||||
|
||||
@@ -40,7 +40,7 @@ const static char* strKernel = KERNEL_CODE(
|
||||
/* The purpose of this is to introduce an additional zero at stage - pass
|
||||
* bit*/
|
||||
const uint leftID = (thread & (pairDistance - 1)) |
|
||||
((thread & ~(pairDistance - 1)) << 1); /* Is the same as below */
|
||||
((thread & ~(pairDistance - 1)) << 1); /* Is the same as below */
|
||||
|
||||
const uint direction = ((thread >> stage) & 1) == 1 ? 0 : 1;
|
||||
|
||||
|
||||
@@ -183,8 +183,8 @@ void OCLMultiQueue::open(unsigned int test, char* units, double& conversion,
|
||||
sizeof(maxComputeUnits), &maxComputeUnits, NULL);
|
||||
computePower *= 32 * maxComputeUnits;
|
||||
NumElements = (NumElements < static_cast<size_t>(computePower))
|
||||
? static_cast<size_t>(computePower)
|
||||
: NumElements;
|
||||
? static_cast<size_t>(computePower)
|
||||
: NumElements;
|
||||
program_ = _wrapper->clCreateProgramWithSource(context_, 1, &strKernel, NULL, &error_);
|
||||
CHECK_RESULT((error_ != CL_SUCCESS), "clCreateProgramWithSource() failed");
|
||||
error_ = _wrapper->clBuildProgram(program_, 1, &devices_[deviceId], NULL, NULL, NULL);
|
||||
|
||||
@@ -140,8 +140,8 @@ int main(int argc, char** argv) {
|
||||
|
||||
bool isAMDPlatform = (strcmp(platform.getInfo<CL_PLATFORM_NAME>().c_str(),
|
||||
"AMD Accelerated Parallel Processing") == 0)
|
||||
? true
|
||||
: false;
|
||||
? true
|
||||
: false;
|
||||
if (isAMDPlatform) {
|
||||
std::string boardName;
|
||||
device.getInfo(CL_DEVICE_BOARD_NAME_AMD, &boardName);
|
||||
|
||||
@@ -188,7 +188,7 @@ bool setAliasOptionVariable(int OptDescTableIx, Options& Opts, int64_t IValue, c
|
||||
if (OptDescTableIx == OID_SaveTemps) {
|
||||
// Dump .cl, .i(.ii), .amdil, .isa, .s, dll, calimage
|
||||
flags = DUMP_CL | DUMP_I | DUMP_S | DUMP_O | DUMP_DLL | DUMP_CGIL | DUMP_DEBUGIL | DUMP_IL |
|
||||
DUMP_ISA;
|
||||
DUMP_ISA;
|
||||
} else if (OptDescTableIx == OID_SaveTempsAll) {
|
||||
flags = DUMP_ALL;
|
||||
} else { // OID_Output
|
||||
@@ -531,7 +531,8 @@ int getOptionDesc(std::string& options, size_t StartPos, bool IsShortForm, Optio
|
||||
}
|
||||
|
||||
char next_c = options.at(pos);
|
||||
bool optionalHasValue = (OPTION_value(od) == OVA_OPTIONAL) &&
|
||||
bool optionalHasValue =
|
||||
(OPTION_value(od) == OVA_OPTIONAL) &&
|
||||
(((OPTION_info(od) & OA_SEPARATOR_EQUAL) && (next_c == '=')) ||
|
||||
((OPTION_info(od) & OA_SEPARATOR_NONE) && !OPTION_valueSeparator(next_c)));
|
||||
bool hasValue = (OPTION_value(od) == OVA_REQUIRED) || optionalHasValue;
|
||||
|
||||
@@ -339,9 +339,9 @@ const Isa* Isa::findIsa(uint32_t versionMajor, uint32_t versionMinor, uint32_t v
|
||||
auto supportedIsas_ = supportedIsas();
|
||||
auto isaIter = std::find_if(supportedIsas_.first, supportedIsas_.second, [&](const Isa& isa) {
|
||||
return versionMajor == isa.versionMajor_ && versionMinor == isa.versionMinor_ &&
|
||||
versionStepping == isa.versionStepping_ &&
|
||||
(isa.sramecc_ == amd::Isa::Feature::Unsupported || isa.sramecc_ == sramecc) &&
|
||||
(isa.xnack_ == amd::Isa::Feature::Unsupported || isa.xnack_ == xnack);
|
||||
versionStepping == isa.versionStepping_ &&
|
||||
(isa.sramecc_ == amd::Isa::Feature::Unsupported || isa.sramecc_ == sramecc) &&
|
||||
(isa.xnack_ == amd::Isa::Feature::Unsupported || isa.xnack_ == xnack);
|
||||
});
|
||||
return isaIter == supportedIsas_.second ? nullptr : isaIter;
|
||||
}
|
||||
@@ -1132,7 +1132,7 @@ bool Device::IpcCreate(void* dev_ptr, size_t* mem_size, char* handle, size_t* me
|
||||
|
||||
// Calculate the memory offset from the original base ptr
|
||||
*mem_offset = reinterpret_cast<address>(dev_ptr) - reinterpret_cast<address>(orig_dev_ptr) +
|
||||
amd_mem_obj->getOffset();
|
||||
amd_mem_obj->getOffset();
|
||||
|
||||
*mem_size = amd_mem_obj->getSize();
|
||||
|
||||
|
||||
@@ -1763,8 +1763,8 @@ class Device : public RuntimeObject {
|
||||
return (info().svmCapabilities_ &
|
||||
(CL_DEVICE_SVM_COARSE_GRAIN_BUFFER | CL_DEVICE_SVM_FINE_GRAIN_BUFFER |
|
||||
CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)) != 0
|
||||
? true
|
||||
: false;
|
||||
? true
|
||||
: false;
|
||||
}
|
||||
|
||||
//! check svm FGS support capability.
|
||||
|
||||
@@ -769,8 +769,8 @@ static inline uint32_t GetOclArgumentTypeOCL(const aclArgData* argInfo, bool* is
|
||||
return amd::KernelParameterDescriptor::QueueObject;
|
||||
case ARG_TYPE_VALUE:
|
||||
return (argInfo->arg.value.data == DATATYPE_struct)
|
||||
? amd::KernelParameterDescriptor::ReferenceObject
|
||||
: amd::KernelParameterDescriptor::ValueObject;
|
||||
? amd::KernelParameterDescriptor::ReferenceObject
|
||||
: amd::KernelParameterDescriptor::ValueObject;
|
||||
case ARG_TYPE_IMAGE:
|
||||
return amd::KernelParameterDescriptor::ImageObject;
|
||||
case ARG_TYPE_SAMPLER:
|
||||
|
||||
@@ -511,8 +511,8 @@ bool Program::compileAndLinkExecutable(const amd_comgr_data_set_t inputs,
|
||||
if (status == AMD_COMGR_STATUS_SUCCESS) {
|
||||
hasRelocatableData = true;
|
||||
amd_comgr_action_kind_t kind = (continueCompileFrom == FILE_TYPE_ASM_TEXT)
|
||||
? AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE
|
||||
: AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE;
|
||||
? AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE
|
||||
: AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE;
|
||||
status = amd::Comgr::do_action(kind, action, inputs, relocatableData);
|
||||
extractBuildLog(relocatableData);
|
||||
}
|
||||
@@ -1259,9 +1259,9 @@ bool Program::linkImplHSAIL(amd::option::Options* options) {
|
||||
bool finalize = true;
|
||||
internal_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
||||
// If !binaryElf_ then program must have been created using clCreateProgramWithBinary
|
||||
aclType continueCompileFrom = (!binaryElf_)
|
||||
? static_cast<aclType>(getNextCompilationStageFromBinary(options))
|
||||
: ACL_TYPE_LLVMIR_BINARY;
|
||||
aclType continueCompileFrom =
|
||||
(!binaryElf_) ? static_cast<aclType>(getNextCompilationStageFromBinary(options))
|
||||
: ACL_TYPE_LLVMIR_BINARY;
|
||||
|
||||
switch (continueCompileFrom) {
|
||||
case ACL_TYPE_SPIRV_BINARY:
|
||||
@@ -2857,9 +2857,8 @@ bool Program::getDemangledName(const std::string& mangledName, std::string& dema
|
||||
|
||||
demangledName.resize(demangled_size);
|
||||
|
||||
if (AMD_COMGR_STATUS_SUCCESS !=
|
||||
amd::Comgr::get_data(demangled_data, &demangled_size,
|
||||
const_cast<char*>(demangledName.data()))) {
|
||||
if (AMD_COMGR_STATUS_SUCCESS != amd::Comgr::get_data(demangled_data, &demangled_size,
|
||||
const_cast<char*>(demangledName.data()))) {
|
||||
amd::Comgr::release_data(mangled_data);
|
||||
amd::Comgr::release_data(demangled_data);
|
||||
return false;
|
||||
|
||||
@@ -2166,18 +2166,18 @@ bool KernelBlitManager::fillBuffer(device::Memory& memory, const void* pattern,
|
||||
for (auto& packed_obj : packed_vector) {
|
||||
constexpr uint32_t kFillType = FillBufferAligned;
|
||||
uint32_t kpattern_size = (packed_obj.pattern_expanded_)
|
||||
? HostBlitManager::FillBufferInfo::kExtendedSize
|
||||
: patternSize;
|
||||
? HostBlitManager::FillBufferInfo::kExtendedSize
|
||||
: patternSize;
|
||||
size_t kfill_size = packed_obj.fill_size_ / kpattern_size;
|
||||
uint64_t koffset = overall_offset;
|
||||
overall_offset += packed_obj.fill_size_;
|
||||
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t)
|
||||
: (kpattern_size & 0x7) == 0 ? sizeof(uint64_t)
|
||||
: (kpattern_size & 0x3) == 0 ? sizeof(uint32_t)
|
||||
: (kpattern_size & 0x1) == 0 ? sizeof(uint16_t)
|
||||
: sizeof(uint8_t);
|
||||
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t)
|
||||
: (kpattern_size & 0x7) == 0 ? sizeof(uint64_t)
|
||||
: (kpattern_size & 0x3) == 0 ? sizeof(uint32_t)
|
||||
: (kpattern_size & 0x1) == 0 ? sizeof(uint16_t)
|
||||
: sizeof(uint8_t);
|
||||
|
||||
// Program kernels arguments for the fill operation
|
||||
Memory* mem = &gpuMem(memory);
|
||||
@@ -2302,9 +2302,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
|
||||
constexpr size_t kFillImageThreshold = 256 * 256;
|
||||
|
||||
// Use host fill if memory has direct access and image is small
|
||||
if (setup_.disableFillImage_ ||
|
||||
(gpuMem(memory).isHostMemDirectAccess() &&
|
||||
(size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) {
|
||||
if (setup_.disableFillImage_ || (gpuMem(memory).isHostMemDirectAccess() &&
|
||||
(size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) {
|
||||
gpu().releaseGpuMemoryFence();
|
||||
|
||||
result = HostBlitManager::fillImage(memory, pattern, origin, size, entire);
|
||||
|
||||
@@ -194,7 +194,7 @@ bool PalCounterReference::finalize() {
|
||||
|
||||
assert(layout.sampleCount == numExpCounters_);
|
||||
size_t size = sizeof(Pal::GlobalCounterLayout) +
|
||||
(sizeof(Pal::GlobalSampleLayout) * (layout.sampleCount - 1));
|
||||
(sizeof(Pal::GlobalSampleLayout) * (layout.sampleCount - 1));
|
||||
layout_ = reinterpret_cast<Pal::GlobalCounterLayout*>(new char[size]);
|
||||
if (layout_ != nullptr) {
|
||||
layout_->sampleCount = layout.sampleCount;
|
||||
@@ -728,7 +728,7 @@ bool PerfCounter::create() {
|
||||
}
|
||||
counter_start = info_.counterIndex_;
|
||||
counter_step = dev().properties().gfxipProperties.shaderCore.numShaderArrays *
|
||||
dev().properties().gfxipProperties.shaderCore.numShaderEngines;
|
||||
dev().properties().gfxipProperties.shaderCore.numShaderEngines;
|
||||
break;
|
||||
|
||||
case PCIndexSelect::ComputeUnit:
|
||||
|
||||
@@ -111,8 +111,8 @@ static std::tuple<const amd::Isa*, const char*> findIsa(uint32_t gfxipMajor, uin
|
||||
auto palDeviceIter = std::find_if(std::begin(supportedPalDevices), std::end(supportedPalDevices),
|
||||
[&](const PalDevice& palDevice) {
|
||||
return palDevice.gfxipMajor_ == gfxipMajor &&
|
||||
palDevice.gfxipMinor_ == gfxipMinor &&
|
||||
palDevice.gfxipStepping_ == (gfxipStepping & 0xF);
|
||||
palDevice.gfxipMinor_ == gfxipMinor &&
|
||||
palDevice.gfxipStepping_ == (gfxipStepping & 0xF);
|
||||
});
|
||||
if (palDeviceIter == std::end(supportedPalDevices)) {
|
||||
return std::make_tuple(nullptr, nullptr);
|
||||
@@ -131,8 +131,8 @@ static std::tuple<Pal::GfxIpLevel, Pal::AsicRevision, const char*> findPal(uint3
|
||||
auto palDeviceIter = std::find_if(std::begin(supportedPalDevices), std::end(supportedPalDevices),
|
||||
[&](const PalDevice& palDevice) {
|
||||
return palDevice.gfxipMajor_ == gfxipMajor &&
|
||||
palDevice.gfxipMinor_ == gfxipMinor &&
|
||||
palDevice.gfxipStepping_ == (gfxipStepping & 0xF);
|
||||
palDevice.gfxipMinor_ == gfxipMinor &&
|
||||
palDevice.gfxipStepping_ == (gfxipStepping & 0xF);
|
||||
});
|
||||
if (palDeviceIter == std::end(supportedPalDevices)) {
|
||||
return std::make_tuple(Pal::GfxIpLevel::None, Pal::AsicRevision::Unknown, nullptr);
|
||||
@@ -351,8 +351,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
|
||||
info_.maxWorkItemDimensions_ = 3;
|
||||
|
||||
info_.maxComputeUnits_ = settings().enableWgpMode_
|
||||
? palProp.gfxipProperties.shaderCore.numAvailableCus / 2
|
||||
: palProp.gfxipProperties.shaderCore.numAvailableCus;
|
||||
? palProp.gfxipProperties.shaderCore.numAvailableCus / 2
|
||||
: palProp.gfxipProperties.shaderCore.numAvailableCus;
|
||||
info_.maxPhysicalComputeUnits_ = info_.maxComputeUnits_;
|
||||
info_.numberOfShaderEngines = palProp.gfxipProperties.shaderCore.numShaderEngines;
|
||||
|
||||
@@ -371,11 +371,11 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
|
||||
info_.nativeVectorWidthHalf_ = info_.preferredVectorWidthHalf_ = 0; // no half support
|
||||
|
||||
info_.maxEngineClockFrequency_ = (palProp.gfxipProperties.performance.maxGpuClock != 0)
|
||||
? palProp.gfxipProperties.performance.maxGpuClock
|
||||
: 555;
|
||||
? palProp.gfxipProperties.performance.maxGpuClock
|
||||
: 555;
|
||||
info_.maxMemoryClockFrequency_ = (palProp.gpuMemoryProperties.performance.maxMemClock != 0)
|
||||
? palProp.gpuMemoryProperties.performance.maxMemClock
|
||||
: 555;
|
||||
? palProp.gpuMemoryProperties.performance.maxMemClock
|
||||
: 555;
|
||||
info_.wallClockFrequency_ = palProp.timestampFrequency / 1000; // in KHz
|
||||
info_.vramBusBitWidth_ = palProp.gpuMemoryProperties.performance.vramBusBitWidth;
|
||||
info_.l2CacheSize_ = palProp.gfxipProperties.shaderCore.tccSizeInBytes;
|
||||
@@ -417,8 +417,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
|
||||
|
||||
uint uswcPercentAvailable =
|
||||
((static_cast<uint64_t>(heaps[Pal::GpuHeapGartUswc].logicalSize) / Mi) > 1536 && IS_WINDOWS)
|
||||
? 75
|
||||
: 50;
|
||||
? 75
|
||||
: 50;
|
||||
if (settings().apuSystem_) {
|
||||
info_.globalMemSize_ +=
|
||||
(static_cast<uint64_t>(heaps[Pal::GpuHeapGartUswc].logicalSize) * uswcPercentAvailable) /
|
||||
@@ -622,8 +622,8 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
|
||||
info_.deviceTopology_.pcie.function = palProp.pciProperties.functionNumber;
|
||||
|
||||
info_.simdPerCU_ = settings().enableWgpMode_
|
||||
? (2 * palProp.gfxipProperties.shaderCore.numSimdsPerCu)
|
||||
: palProp.gfxipProperties.shaderCore.numSimdsPerCu;
|
||||
? (2 * palProp.gfxipProperties.shaderCore.numSimdsPerCu)
|
||||
: palProp.gfxipProperties.shaderCore.numSimdsPerCu;
|
||||
info_.cuPerShaderArray_ = palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
|
||||
info_.simdWidth_ = isa().simdWidth();
|
||||
info_.simdInstructionWidth_ = 1;
|
||||
@@ -656,7 +656,7 @@ void NullDevice::fillDeviceInfo(const Pal::DeviceProperties& palProp,
|
||||
info_.pcieDeviceId_ = palProp.deviceId;
|
||||
info_.pcieRevisionId_ = palProp.revisionId;
|
||||
info_.maxThreadsPerCU_ = info_.wavefrontWidth_ * info_.simdPerCU_ *
|
||||
palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
|
||||
palProp.gfxipProperties.shaderCore.numWavefrontsPerSimd;
|
||||
|
||||
info_.cooperativeGroups_ = settings().enableCoopGroups_;
|
||||
info_.cooperativeMultiDeviceGroups_ = settings().enableCoopMultiDeviceGroups_;
|
||||
@@ -906,8 +906,8 @@ bool Device::create(Pal::IDevice* device) {
|
||||
// Save the IP level for the offline detection
|
||||
ipLevel_ = properties().gfxLevel;
|
||||
asicRevision_ = flagIsDefault(PAL_FORCE_ASIC_REVISION)
|
||||
? properties().revision
|
||||
: static_cast<Pal::AsicRevision>(PAL_FORCE_ASIC_REVISION);
|
||||
? properties().revision
|
||||
: static_cast<Pal::AsicRevision>(PAL_FORCE_ASIC_REVISION);
|
||||
|
||||
// XNACK flag should be set for PageMigration or IOMMUv2 support.
|
||||
bool isXNACKEnabled =
|
||||
@@ -1284,10 +1284,9 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
|
||||
if (queue != nullptr) {
|
||||
profiling = queue->properties().test(CL_QUEUE_PROFILING_ENABLE);
|
||||
if (queue->asHostQueue() != nullptr) {
|
||||
bool interopQueue = (0 !=
|
||||
(queue->context().info().flags_ &
|
||||
(amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr |
|
||||
amd::Context::D3D11DeviceKhr)));
|
||||
bool interopQueue = (0 != (queue->context().info().flags_ &
|
||||
(amd::Context::GLDeviceKhr | amd::Context::D3D10DeviceKhr |
|
||||
amd::Context::D3D11DeviceKhr)));
|
||||
rtCUs = queue->rtCUs();
|
||||
} else if (queue->asDeviceQueue() != nullptr) {
|
||||
deviceQueueSize = queue->asDeviceQueue()->size();
|
||||
@@ -1439,9 +1438,9 @@ bool Device::init() {
|
||||
// Count up all the devices in the system.
|
||||
platform_->EnumerateDevices(&gNumDevices, &gDeviceList[0]);
|
||||
|
||||
const char* requestedDeviceList = amd::IS_HIP
|
||||
? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
|
||||
: GPU_DEVICE_ORDINAL;
|
||||
const char* requestedDeviceList =
|
||||
amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
|
||||
: GPU_DEVICE_ORDINAL;
|
||||
|
||||
if (requestedDeviceList[0] != '\0') {
|
||||
useDeviceList = true;
|
||||
@@ -1611,8 +1610,8 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const {
|
||||
|
||||
Resource::MemoryType type =
|
||||
(owner.forceSysMemAlloc() || (owner.getMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER))
|
||||
? Resource::Remote
|
||||
: Resource::Local;
|
||||
? Resource::Remote
|
||||
: Resource::Local;
|
||||
|
||||
// Check if runtime can force a tiny buffer into USWC memory
|
||||
if ((size <= (GPU_MAX_REMOTE_MEM_SIZE * Ki)) && (type == Resource::Local) &&
|
||||
@@ -1633,8 +1632,8 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const {
|
||||
// Internal means VirtualDevice!=nullptr
|
||||
bool internalAlloc =
|
||||
((owner.getMemFlags() & CL_MEM_USE_HOST_PTR) && (owner.getVirtualDevice() != nullptr))
|
||||
? true
|
||||
: false;
|
||||
? true
|
||||
: false;
|
||||
|
||||
// Create a memory object
|
||||
gpuMemory = new pal::Buffer(*this, owner, owner.getSize());
|
||||
@@ -1918,9 +1917,9 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
|
||||
(memory->memoryType() != Resource::ExternalPhysical) &&
|
||||
((owner.getHostMem() != nullptr) ||
|
||||
((nullptr != owner.parent()) && (owner.getHostMem() != nullptr)))) {
|
||||
bool ok = memory->pinSystemMemory(
|
||||
owner.getHostMem(),
|
||||
(owner.getHostMemRef()->size()) ? owner.getHostMemRef()->size() : owner.getSize());
|
||||
bool ok = memory->pinSystemMemory(owner.getHostMem(), (owner.getHostMemRef()->size())
|
||||
? owner.getHostMemRef()->size()
|
||||
: owner.getSize());
|
||||
//! \note: Ignore the pinning result for now
|
||||
}
|
||||
|
||||
@@ -2067,7 +2066,8 @@ bool Device::globalFreeMemory(size_t* freeMemory) const {
|
||||
// Allocated system memory without cached allocations. Cache size contains all allocations, so
|
||||
// don't count persistent and local
|
||||
Pal::gpusize system_memory = allocedMem[Pal::GpuHeapGartCacheable] +
|
||||
allocedMem[Pal::GpuHeapGartUswc] + cache_group_local - resourceCache().cacheSize();
|
||||
allocedMem[Pal::GpuHeapGartUswc] + cache_group_local -
|
||||
resourceCache().cacheSize();
|
||||
|
||||
#if IS_WINDOWS
|
||||
// Second, query OS for overall memory usage on the system
|
||||
@@ -2091,7 +2091,7 @@ bool Device::globalFreeMemory(size_t* freeMemory) const {
|
||||
if (mem_budget_info.usage[Pal::GpuHeapGroupNonLocal] >
|
||||
(resourceCache().cacheSize() - cache_group_local)) {
|
||||
system_total_alloced = mem_budget_info.usage[Pal::GpuHeapGroupNonLocal] + cache_group_local -
|
||||
resourceCache().cacheSize();
|
||||
resourceCache().cacheSize();
|
||||
}
|
||||
// System usage exceeds per process usage for system memory
|
||||
if (system_total_alloced > system_memory) {
|
||||
@@ -2102,9 +2102,10 @@ bool Device::globalFreeMemory(size_t* freeMemory) const {
|
||||
// Third, finalize reported free memory
|
||||
|
||||
// Fill free memory info
|
||||
freeMemory[TotalFreeMemory] = (total_alloced > info().globalMemSize_)
|
||||
? 0
|
||||
: static_cast<size_t>((info().globalMemSize_ - total_alloced) / Ki);
|
||||
freeMemory[TotalFreeMemory] =
|
||||
(total_alloced > info().globalMemSize_)
|
||||
? 0
|
||||
: static_cast<size_t>((info().globalMemSize_ - total_alloced) / Ki);
|
||||
|
||||
freeMemory[TotalFreeMemory] -=
|
||||
(freeMemory[TotalFreeMemory] > HIP_HIDDEN_FREE_MEM * Ki) ? HIP_HIDDEN_FREE_MEM * Ki : 0;
|
||||
@@ -2842,8 +2843,8 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI
|
||||
(Pal::Result::Success ==
|
||||
(iDev()->SetClockMode(setClockMode,
|
||||
reinterpret_cast<Pal::SetClockModeOutput*>(pSetClockModeOutput))))
|
||||
? true
|
||||
: false;
|
||||
? true
|
||||
: false;
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@@ -490,10 +490,10 @@ class Device : public NullDevice {
|
||||
//! Returns the number of available compute rings
|
||||
uint numExclusiveComputeEngines() const {
|
||||
return exclusiveComputeEnginesId_.size() +
|
||||
((exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1) ==
|
||||
exclusiveComputeEnginesId().end())
|
||||
? 1
|
||||
: 0);
|
||||
((exclusiveComputeEnginesId().find(ExclusiveQueueType::RealTime1) ==
|
||||
exclusiveComputeEnginesId().end())
|
||||
? 1
|
||||
: 0);
|
||||
}
|
||||
|
||||
//! Returns the map of available exclusive compute rings with the engine index
|
||||
|
||||
@@ -59,7 +59,7 @@ bool Device::associateD3D9Device(void* d3d9Device) {
|
||||
|
||||
// match the adapter
|
||||
bool canInteroperate = (properties().osProperties.luidHighPart == d3d9deviceLuid.HighPart) &&
|
||||
(properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart);
|
||||
(properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart);
|
||||
|
||||
return canInteroperate;
|
||||
}
|
||||
|
||||
@@ -782,8 +782,8 @@ bool Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const
|
||||
if (wglGetContextGPUInfoAMD(hRC, &glAdapterLuid, &glChainBitMask)) {
|
||||
// match the adapter
|
||||
canInteroperate = (properties().osProperties.luidHighPart == glAdapterLuid.HighPart) &&
|
||||
(properties().osProperties.luidLowPart == glAdapterLuid.LowPart) &&
|
||||
((1 << properties().gpuIndex) == glChainBitMask);
|
||||
(properties().osProperties.luidLowPart == glAdapterLuid.LowPart) &&
|
||||
((1 << properties().gpuIndex) == glChainBitMask);
|
||||
}
|
||||
#else
|
||||
GLuint glDeviceId = 0;
|
||||
@@ -797,9 +797,9 @@ bool Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const
|
||||
if (pfnMesaGLInteropGLXQueryDeviceInfo(disp, ctx, &info) == 0) {
|
||||
// match the adapter
|
||||
canInteroperate = (properties().pciProperties.busNumber == info.pci_bus) &&
|
||||
(properties().pciProperties.deviceNumber == info.pci_device) &&
|
||||
(properties().pciProperties.functionNumber == info.pci_function) &&
|
||||
(static_cast<GLuint>(1 << properties().gpuIndex) == glChainMask);
|
||||
(properties().pciProperties.deviceNumber == info.pci_device) &&
|
||||
(properties().pciProperties.functionNumber == info.pci_function) &&
|
||||
(static_cast<GLuint>(1 << properties().gpuIndex) == glChainMask);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -620,8 +620,8 @@ Pal::Result RgpCaptureMgr::BeginRGPTrace(VirtualGPU* gpu) {
|
||||
if (result == Pal::Result::Success) {
|
||||
GpuUtil::SampleTraceApiInfo sample_trace_api_info = {};
|
||||
sample_trace_api_info.instructionTraceMode = (inst_tracing_enabled_)
|
||||
? GpuUtil::InstructionTraceMode::FullFrame
|
||||
: GpuUtil::InstructionTraceMode::Disabled;
|
||||
? GpuUtil::InstructionTraceMode::FullFrame
|
||||
: GpuUtil::InstructionTraceMode::Disabled;
|
||||
trace_.gpa_session_->SetSampleTraceApiInfo(sample_trace_api_info, trace_.gpa_sample_id_);
|
||||
}
|
||||
|
||||
|
||||
@@ -167,7 +167,7 @@ bool HSAILKernel::init() {
|
||||
// Find total workgroup size
|
||||
if (workGroupInfo_.compileSize_[0] != 0) {
|
||||
workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
|
||||
workGroupInfo_.compileSize_[2];
|
||||
workGroupInfo_.compileSize_[2];
|
||||
} else {
|
||||
workGroupInfo_.size_ = device().info().preferredWorkGroupSize_;
|
||||
}
|
||||
|
||||
@@ -367,8 +367,8 @@ bool Memory::createInterop() {
|
||||
vkRes.nt_handle_ = ((ext_memory->Type() != amd::ExternalMemory::HandleType::OpaqueFd) &&
|
||||
(ext_memory->Type() != amd::ExternalMemory::HandleType::OpaqueWin32Kmt) &&
|
||||
(ext_memory->Type() != amd::ExternalMemory::HandleType::D3D11ResourceKmt))
|
||||
? true
|
||||
: false;
|
||||
? true
|
||||
: false;
|
||||
}
|
||||
|
||||
else if (glObject != nullptr) {
|
||||
|
||||
@@ -289,8 +289,8 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t
|
||||
case 4:
|
||||
if (printFloat) {
|
||||
const float fArg = size == 2
|
||||
? amd::half2float(*(reinterpret_cast<const uint16_t*>(argument)))
|
||||
: *(reinterpret_cast<const float*>(argument));
|
||||
? amd::half2float(*(reinterpret_cast<const uint16_t*>(argument)))
|
||||
: *(reinterpret_cast<const float*>(argument));
|
||||
static const char* fSpecifiers = "eEfgGa";
|
||||
std::string fmtF = fmt;
|
||||
size_t posS = fmtF.find_first_of("%");
|
||||
@@ -327,13 +327,12 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t
|
||||
hhFmt.erase(hhFmt.find_first_of("h"), 2);
|
||||
amd::Os::printf(hhFmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
|
||||
} else if (hlModifier) {
|
||||
amd::Os::printf(hlFmt.data(),
|
||||
size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
|
||||
: *(reinterpret_cast<const uint32_t*>(argument)));
|
||||
amd::Os::printf(hlFmt.data(), size == 2
|
||||
? *(reinterpret_cast<const uint16_t*>(argument))
|
||||
: *(reinterpret_cast<const uint32_t*>(argument)));
|
||||
} else {
|
||||
amd::Os::printf(fmt.data(),
|
||||
size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
|
||||
: *(reinterpret_cast<const uint32_t*>(argument)));
|
||||
amd::Os::printf(fmt.data(), size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
|
||||
: *(reinterpret_cast<const uint32_t*>(argument)));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -305,7 +305,7 @@ Resource::Resource(const Device& gpuDev, size_t size)
|
||||
desc_.state_ = 0;
|
||||
desc_.type_ = Empty;
|
||||
desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) /
|
||||
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
|
||||
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
|
||||
desc_.height_ = 1;
|
||||
desc_.depth_ = 1;
|
||||
desc_.mipLevels_ = 1;
|
||||
@@ -859,9 +859,8 @@ bool Resource::CreateInterop(CreateParams* params) {
|
||||
size_t imageSize;
|
||||
size_t gpuMemSize;
|
||||
|
||||
if (Pal::Result::Success !=
|
||||
dev().iDev()->GetExternalSharedImageSizes(imgOpenInfo, &imageSize, &gpuMemSize,
|
||||
&imgCreateInfo)) {
|
||||
if (Pal::Result::Success != dev().iDev()->GetExternalSharedImageSizes(
|
||||
imgOpenInfo, &imageSize, &gpuMemSize, &imgCreateInfo)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1327,8 +1326,8 @@ bool Resource::create(MemoryType memType, CreateParams* params, bool forceLinear
|
||||
createInfo.size = desc().width_ * elementSize_;
|
||||
createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment);
|
||||
createInfo.alignment = (params && params->alignment_ != 0)
|
||||
? params->alignment_
|
||||
: (desc().scratch_ ? 64 * Ki : MaxGpuAlignment);
|
||||
? params->alignment_
|
||||
: (desc().scratch_ ? 64 * Ki : MaxGpuAlignment);
|
||||
createInfo.vaRange = Pal::VaRange::Default;
|
||||
createInfo.priority = Pal::GpuMemPriority::Normal;
|
||||
|
||||
@@ -1388,7 +1387,7 @@ void Resource::free() {
|
||||
}
|
||||
|
||||
const bool wait = (memoryType() != ImageView) && (memoryType() != ImageBuffer) &&
|
||||
(memoryType() != ImageExternalBuffer) && (memoryType() != View);
|
||||
(memoryType() != ImageExternalBuffer) && (memoryType() != View);
|
||||
|
||||
// OCL has to wait, even if resource is placed in the cache, since reallocation can occur
|
||||
// and resource can be reused on another async queue without a wait on a busy operation
|
||||
@@ -1519,8 +1518,8 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
|
||||
}
|
||||
|
||||
bool cp_dma = dev().settings().disableSdma_ ||
|
||||
(!enableCopyRect && desc().buffer_ && dstResource.desc().buffer_ &&
|
||||
(size[0] < dev().settings().cpDmaCopySizeMax_));
|
||||
(!enableCopyRect && desc().buffer_ && dstResource.desc().buffer_ &&
|
||||
(size[0] < dev().settings().cpDmaCopySizeMax_));
|
||||
if (cp_dma) {
|
||||
// Make sure compute is done before CP DMA start
|
||||
gpu.addBarrier(RgpSqqtBarrierReason::MemDependency, BarrierType::KernelToCopy);
|
||||
@@ -1563,9 +1562,9 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
|
||||
}
|
||||
copyRegion.gpuMemoryOffset = gpuMemoryOffset;
|
||||
copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch;
|
||||
copyRegion.gpuMemoryDepthPitch = (srcOrigin[2])
|
||||
? srcOrigin[2]
|
||||
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
|
||||
copyRegion.gpuMemoryDepthPitch =
|
||||
(srcOrigin[2]) ? srcOrigin[2]
|
||||
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
|
||||
gpu.iCmd()->CmdCopyMemoryToImage(*iMem(), *dstResource.image_, imgLayout, 1, &copyRegion);
|
||||
} else if (!desc().buffer_ && dstResource.desc().buffer_) {
|
||||
Pal::MemoryImageCopyRegion copyRegion = {};
|
||||
@@ -1588,9 +1587,9 @@ bool Resource::partialMemCopyTo(VirtualGPU& gpu, const amd::Coord3D& srcOrigin,
|
||||
}
|
||||
copyRegion.gpuMemoryOffset = gpuMemoryOffset;
|
||||
copyRegion.gpuMemoryRowPitch = gpuMemoryRowPitch;
|
||||
copyRegion.gpuMemoryDepthPitch = (dstOrigin[2])
|
||||
? dstOrigin[2]
|
||||
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
|
||||
copyRegion.gpuMemoryDepthPitch =
|
||||
(dstOrigin[2]) ? dstOrigin[2]
|
||||
: copyRegion.gpuMemoryRowPitch * copyRegion.imageExtent.height;
|
||||
gpu.iCmd()->CmdCopyImageToMemory(*image_, imgLayout, *dstResource.iMem(), 1, &copyRegion);
|
||||
} else {
|
||||
if (enableCopyRect) {
|
||||
|
||||
@@ -424,7 +424,7 @@ class Resource : public amd::HeapObject {
|
||||
memRef_ = viewOwner_->memRef_;
|
||||
memRef_->retain();
|
||||
desc_.width_ = amd::alignUp(size, Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint)) /
|
||||
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
|
||||
Pal::Formats::BytesPerPixel(Pal::ChNumFormat::X32_Uint);
|
||||
setBusy(*memRef()->gpu_, GpuEvent::InvalidID);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -341,9 +341,8 @@ bool Settings::create(const Pal::DeviceProperties& palProp,
|
||||
#endif
|
||||
}
|
||||
|
||||
if (apuSystem_ &&
|
||||
((heaps[Pal::GpuHeapLocal].logicalSize + heaps[Pal::GpuHeapInvisible].logicalSize) <
|
||||
(150 * Mi))) {
|
||||
if (apuSystem_ && ((heaps[Pal::GpuHeapLocal].logicalSize +
|
||||
heaps[Pal::GpuHeapInvisible].logicalSize) < (150 * Mi))) {
|
||||
remoteAlloc_ = true;
|
||||
}
|
||||
|
||||
|
||||
@@ -896,7 +896,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
// \todo forces PAL to reuse CBs, but requires postamble
|
||||
createInfo.flags.autoMemoryReuse = false;
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].allocHeap = Pal::GpuHeapGartUswc;
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize = VirtualGPU::Queue::MaxCommands *
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize =
|
||||
VirtualGPU::Queue::MaxCommands *
|
||||
(320 + ((profiling) ? 96 : 0) + ((dev().captureMgr() != nullptr) ? 512 : 0));
|
||||
createInfo.allocInfo[Pal::CommandDataAlloc].allocSize =
|
||||
dev().settings().maxCmdBuffers_ * createInfo.allocInfo[Pal::CommandDataAlloc].suballocSize;
|
||||
@@ -925,8 +926,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
|
||||
uint idx = index() % dev().numComputeEngines();
|
||||
uint64_t residency_limit = dev().properties().gpuMemoryProperties.flags.supportPerSubmitMemRefs
|
||||
? 0
|
||||
: (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
|
||||
? 0
|
||||
: (dev().properties().gpuMemoryProperties.maxLocalMemSize >> 2);
|
||||
uint max_cmd_buffers = dev().settings().maxCmdBuffers_;
|
||||
|
||||
if (dev().numComputeEngines()) {
|
||||
@@ -937,8 +938,8 @@ bool VirtualGPU::create(bool profiling, uint deviceQueueSize, uint rtCUs,
|
||||
}
|
||||
const auto& info = dev().QueuePool().find(queues_[MainEngine]->iQueue_);
|
||||
hwRing_ = (info != dev().QueuePool().end())
|
||||
? info->second->index_
|
||||
: (index() % dev().numExclusiveComputeEngines()) + GPU_MAX_HW_QUEUES;
|
||||
? info->second->index_
|
||||
: (index() % dev().numExclusiveComputeEngines()) + GPU_MAX_HW_QUEUES;
|
||||
|
||||
// Check if device has SDMA engines
|
||||
if (dev().numDMAEngines() != 0 && !dev().settings().disableSdma_) {
|
||||
@@ -2158,7 +2159,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& vcmd) {
|
||||
amd::Memory* dstMemory = amd::MemObjMap::FindMemObj(vcmd.dst());
|
||||
assert(dstMemory && "No svm Buffer to fill with!");
|
||||
size_t offset = reinterpret_cast<uintptr_t>(vcmd.dst()) -
|
||||
reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());
|
||||
reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());
|
||||
|
||||
pal::Memory* memory = dev().getGpuMemory(dstMemory);
|
||||
|
||||
@@ -2828,15 +2829,13 @@ void VirtualGPU::submitExternalSemaphoreCmd(amd::ExternalSemaphoreCmd& cmd) {
|
||||
|
||||
if (cmd.semaphoreCmd() == amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE) {
|
||||
flushDMA(MainEngine);
|
||||
if (Pal::Result::Success !=
|
||||
queues_[MainEngine]->iQueue_->SignalQueueSemaphore(const_cast<Pal::IQueueSemaphore*>(sem),
|
||||
cmd.fence())) {
|
||||
if (Pal::Result::Success != queues_[MainEngine]->iQueue_->SignalQueueSemaphore(
|
||||
const_cast<Pal::IQueueSemaphore*>(sem), cmd.fence())) {
|
||||
LogError("Failed to signal external semaphore");
|
||||
}
|
||||
} else {
|
||||
if (Pal::Result::Success !=
|
||||
queues_[MainEngine]->iQueue_->WaitQueueSemaphore(const_cast<Pal::IQueueSemaphore*>(sem),
|
||||
cmd.fence())) {
|
||||
if (Pal::Result::Success != queues_[MainEngine]->iQueue_->WaitQueueSemaphore(
|
||||
const_cast<Pal::IQueueSemaphore*>(sem), cmd.fence())) {
|
||||
LogError("Failed to wait on external semaphore");
|
||||
}
|
||||
}
|
||||
@@ -3657,9 +3656,8 @@ bool VirtualGPU::processMemObjectsHSA(const amd::Kernel& kernel, const_address p
|
||||
//! Note: SVM with subbuffers has an issue with tracking.
|
||||
//! Conformance can send read only subbuffer, but update the region
|
||||
//! in the kernel.
|
||||
if ((mem != nullptr) &&
|
||||
((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
|
||||
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
|
||||
if ((mem != nullptr) && ((!info.readOnly_ && (mem->getSvmPtr() == nullptr)) ||
|
||||
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0))) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
if (info.oclObject_ == amd::KernelParameterDescriptor::ImageObject) {
|
||||
|
||||
@@ -1709,8 +1709,8 @@ bool KernelBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
|
||||
} else {
|
||||
size_t totalSize = size[0];
|
||||
// Do a staging copy
|
||||
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
|
||||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
|
||||
bool useShaderCopyPath =
|
||||
setup_.disableHwlCopyBuffer_ || (totalSize <= dev().settings().sdmaCopyThreshold_) ||
|
||||
(copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT);
|
||||
|
||||
if (!useShaderCopyPath) {
|
||||
@@ -1843,8 +1843,8 @@ bool KernelBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemo
|
||||
} else {
|
||||
size_t totalSize = size[0];
|
||||
// Do a staging copy
|
||||
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
|
||||
(totalSize <= dev().settings().sdmaCopyThreshold_) ||
|
||||
bool useShaderCopyPath =
|
||||
setup_.disableHwlCopyBuffer_ || (totalSize <= dev().settings().sdmaCopyThreshold_) ||
|
||||
(copyMetadata.copyEnginePreference_ == amd::CopyMetadata::CopyEnginePreference::BLIT);
|
||||
|
||||
if (!useShaderCopyPath) {
|
||||
@@ -2014,18 +2014,18 @@ bool KernelBlitManager::fillBuffer1D(device::Memory& memory, const void* pattern
|
||||
for (auto& packed_obj : packed_vector) {
|
||||
constexpr uint32_t kFillType = FillBufferAligned;
|
||||
uint32_t kpattern_size = (packed_obj.pattern_expanded_)
|
||||
? HostBlitManager::FillBufferInfo::kExtendedSize
|
||||
: patternSize;
|
||||
? HostBlitManager::FillBufferInfo::kExtendedSize
|
||||
: patternSize;
|
||||
size_t kfill_size = packed_obj.fill_size_ / kpattern_size;
|
||||
size_t koffset = overall_offset;
|
||||
overall_offset += packed_obj.fill_size_;
|
||||
|
||||
size_t globalWorkOffset[3] = {0, 0, 0};
|
||||
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t)
|
||||
: (kpattern_size & 0x7) == 0 ? sizeof(uint64_t)
|
||||
: (kpattern_size & 0x3) == 0 ? sizeof(uint32_t)
|
||||
: (kpattern_size & 0x1) == 0 ? sizeof(uint16_t)
|
||||
: sizeof(uint8_t);
|
||||
uint32_t alignment = (kpattern_size & 0xf) == 0 ? 2 * sizeof(uint64_t)
|
||||
: (kpattern_size & 0x7) == 0 ? sizeof(uint64_t)
|
||||
: (kpattern_size & 0x3) == 0 ? sizeof(uint32_t)
|
||||
: (kpattern_size & 0x1) == 0 ? sizeof(uint16_t)
|
||||
: sizeof(uint8_t);
|
||||
// Program kernels arguments for the fill operation
|
||||
cl_mem mem = as_cl<amd::Memory>(memory.owner());
|
||||
setArgument(kernels_[kFillType], 0, sizeof(cl_mem), &mem, koffset);
|
||||
@@ -2096,10 +2096,10 @@ bool KernelBlitManager::fillBuffer2D(device::Memory& memory, const void* pattern
|
||||
size_t globalWorkSize[3] = {amd::alignUp(fillSizeX, 16), amd::alignUp(fillSizeY, 16), 1};
|
||||
size_t localWorkSize[3] = {16, 16, 1};
|
||||
|
||||
uint32_t alignment = (patternSize & 0x7) == 0 ? sizeof(uint64_t)
|
||||
: (patternSize & 0x3) == 0 ? sizeof(uint32_t)
|
||||
: (patternSize & 0x1) == 0 ? sizeof(uint16_t)
|
||||
: sizeof(uint8_t);
|
||||
uint32_t alignment = (patternSize & 0x7) == 0 ? sizeof(uint64_t)
|
||||
: (patternSize & 0x3) == 0 ? sizeof(uint32_t)
|
||||
: (patternSize & 0x1) == 0 ? sizeof(uint16_t)
|
||||
: sizeof(uint8_t);
|
||||
|
||||
cl_mem mem = as_cl<amd::Memory>(memory.owner());
|
||||
if (alignment == sizeof(uint64_t)) {
|
||||
@@ -2250,8 +2250,8 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
|
||||
|
||||
bool ipcShared = srcMemory.owner()->ipcShared() || dstMemory.owner()->ipcShared();
|
||||
|
||||
bool useShaderCopyPath = setup_.disableHwlCopyBuffer_ ||
|
||||
(sizeIn[0] <= dev().settings().sdmaCopyThreshold_) ||
|
||||
bool useShaderCopyPath =
|
||||
setup_.disableHwlCopyBuffer_ || (sizeIn[0] <= dev().settings().sdmaCopyThreshold_) ||
|
||||
(!(p2p || ipcShared) &&
|
||||
(!srcMemory.isHostMemDirectAccess() && !dstMemory.isHostMemDirectAccess() &&
|
||||
!(copyMetadata.copyEnginePreference_ ==
|
||||
@@ -2307,9 +2307,8 @@ bool KernelBlitManager::fillImage(device::Memory& memory, const void* pattern,
|
||||
constexpr size_t kFillImageThreshold = 256 * 256;
|
||||
|
||||
// Use host fill if memory has direct access and image is small
|
||||
if (setup_.disableFillImage_ ||
|
||||
(gpuMem(memory).isHostMemDirectAccess() &&
|
||||
(size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) {
|
||||
if (setup_.disableFillImage_ || (gpuMem(memory).isHostMemDirectAccess() &&
|
||||
(size.c[0] * size.c[1] * size.c[2]) <= kFillImageThreshold)) {
|
||||
// Stall GPU before CPU access
|
||||
gpu().releaseGpuMemoryFence();
|
||||
result = HostBlitManager::fillImage(memory, pattern, origin, size, entire);
|
||||
@@ -2691,8 +2690,8 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, hsa_queue_t* schedulerQueue,
|
||||
|
||||
amd::NDRangeContainer ndrange(1, globalWorkOffset, globalWorkSize, localWorkSize);
|
||||
|
||||
device::Kernel* devKernel = const_cast<device::Kernel*>(
|
||||
kernels_[Scheduler]->getDeviceKernel(dev()));
|
||||
device::Kernel* devKernel =
|
||||
const_cast<device::Kernel*>(kernels_[Scheduler]->getDeviceKernel(dev()));
|
||||
|
||||
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
|
||||
|
||||
|
||||
@@ -376,8 +376,8 @@ hsa_ven_amd_loader_1_00_pfn_t Device::amd_loader_ext_table = {nullptr};
|
||||
|
||||
hsa_status_t Device::loaderQueryHostAddress(const void* device, const void** host) {
|
||||
return amd_loader_ext_table.hsa_ven_amd_loader_query_host_address
|
||||
? amd_loader_ext_table.hsa_ven_amd_loader_query_host_address(device, host)
|
||||
: HSA_STATUS_ERROR;
|
||||
? amd_loader_ext_table.hsa_ven_amd_loader_query_host_address(device, host)
|
||||
: HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
// ================================================================================================
|
||||
@@ -413,9 +413,9 @@ bool Device::init() {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string ordinals = amd::IS_HIP
|
||||
? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
|
||||
: GPU_DEVICE_ORDINAL;
|
||||
std::string ordinals =
|
||||
amd::IS_HIP ? ((HIP_VISIBLE_DEVICES[0] != '\0') ? HIP_VISIBLE_DEVICES : CUDA_VISIBLE_DEVICES)
|
||||
: GPU_DEVICE_ORDINAL;
|
||||
if (ordinals[0] != '\0') {
|
||||
size_t pos = 0;
|
||||
std::vector<hsa_agent_t> valid_agents;
|
||||
@@ -573,9 +573,9 @@ bool Device::create() {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID,
|
||||
&pciDeviceId_)) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_,
|
||||
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID,
|
||||
&pciDeviceId_)) {
|
||||
LogPrintfError("Unable to get PCI ID of HSA device %s", agent_name);
|
||||
return false;
|
||||
}
|
||||
@@ -584,35 +584,34 @@ bool Device::create() {
|
||||
uint count;
|
||||
hsa_isa_t first_isa;
|
||||
} agent_isas = {0, {0}};
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_agent_iterate_isas(
|
||||
bkendDevice_,
|
||||
[](hsa_isa_t isa, void* data) {
|
||||
agent_isas_t* agent_isas = static_cast<agent_isas_t*>(data);
|
||||
if (agent_isas->count++ == 0) {
|
||||
agent_isas->first_isa = isa;
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
},
|
||||
&agent_isas)) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_agent_iterate_isas(
|
||||
bkendDevice_,
|
||||
[](hsa_isa_t isa, void* data) {
|
||||
agent_isas_t* agent_isas = static_cast<agent_isas_t*>(data);
|
||||
if (agent_isas->count++ == 0) {
|
||||
agent_isas->first_isa = isa;
|
||||
}
|
||||
return HSA_STATUS_SUCCESS;
|
||||
},
|
||||
&agent_isas)) {
|
||||
LogPrintfError("Unable to iterate supported ISAs for HSA device %s (PCI ID %x)", agent_name,
|
||||
pciDeviceId_);
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t isa_name_length = 0;
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_isa_get_info_alt(agent_isas.first_isa, (hsa_isa_info_t)HSA_ISA_INFO_NAME_LENGTH,
|
||||
&isa_name_length)) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_isa_get_info_alt(agent_isas.first_isa,
|
||||
(hsa_isa_info_t)HSA_ISA_INFO_NAME_LENGTH,
|
||||
&isa_name_length)) {
|
||||
LogPrintfError("Unable to get ISA name length for HSA device %s (PCI ID %x)", agent_name,
|
||||
pciDeviceId_);
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<char> isa_name(isa_name_length + 1, '\0');
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_isa_get_info_alt(agent_isas.first_isa, (hsa_isa_info_t)HSA_ISA_INFO_NAME,
|
||||
isa_name.data())) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_isa_get_info_alt(agent_isas.first_isa,
|
||||
(hsa_isa_info_t)HSA_ISA_INFO_NAME,
|
||||
isa_name.data())) {
|
||||
LogPrintfError("Unable to get ISA name for HSA device %s (PCI ID %x)", agent_name,
|
||||
pciDeviceId_);
|
||||
return false;
|
||||
@@ -663,10 +662,9 @@ bool Device::create() {
|
||||
assert(!settings_);
|
||||
roc::Settings* hsaSettings = new roc::Settings();
|
||||
settings_ = hsaSettings;
|
||||
if (!hsaSettings ||
|
||||
!hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa,
|
||||
isa->xnack() == amd::Isa::Feature::Enabled, coop_groups, isXgmi_,
|
||||
hasValidHDPFlush)) {
|
||||
if (!hsaSettings || !hsaSettings->create((agent_profile_ == HSA_PROFILE_FULL), *isa,
|
||||
isa->xnack() == amd::Isa::Feature::Enabled, coop_groups,
|
||||
isXgmi_, hasValidHDPFlush)) {
|
||||
LogPrintfError("Unable to create settings for HSA device %s (PCI ID %x)", agent_name,
|
||||
pciDeviceId_);
|
||||
return false;
|
||||
@@ -969,11 +967,11 @@ bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler)
|
||||
void Sampler::fillSampleDescriptor(hsa_ext_sampler_descriptor_v2_t& samplerDescriptor,
|
||||
const amd::Sampler& sampler) const {
|
||||
samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST
|
||||
? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
|
||||
: HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
|
||||
? HSA_EXT_SAMPLER_FILTER_MODE_NEAREST
|
||||
: HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
|
||||
samplerDescriptor.coordinate_mode = sampler.normalizedCoords()
|
||||
? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
|
||||
: HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
|
||||
? HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED
|
||||
: HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
|
||||
for (int i = 0; i < 3; i++) {
|
||||
switch (sampler.addressingMode(i)) {
|
||||
case CL_ADDRESS_CLAMP_TO_EDGE:
|
||||
@@ -1036,9 +1034,9 @@ bool Device::populateOCLDeviceConstants() {
|
||||
|
||||
::strncpy(info_.name_, isa().targetId(), sizeof(info_.name_) - 1);
|
||||
char device_name[64] = {0};
|
||||
if (HSA_STATUS_SUCCESS ==
|
||||
hsa_agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME,
|
||||
device_name)) {
|
||||
if (HSA_STATUS_SUCCESS == hsa_agent_get_info(bkendDevice_,
|
||||
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME,
|
||||
device_name)) {
|
||||
::strncpy(info_.boardName_, device_name, sizeof(info_.boardName_) - 1);
|
||||
}
|
||||
|
||||
@@ -1075,9 +1073,9 @@ bool Device::populateOCLDeviceConstants() {
|
||||
info_.maxPhysicalComputeUnits_ = settings().enableWgpMode_ ? info_.maxPhysicalComputeUnits_ / 2
|
||||
: info_.maxPhysicalComputeUnits_;
|
||||
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_agent_get_info(bkendDevice_, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
|
||||
&info_.globalMemCacheLineSize_)) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_,
|
||||
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
|
||||
&info_.globalMemCacheLineSize_)) {
|
||||
return false;
|
||||
}
|
||||
info_.globalMemCacheLineSize_ =
|
||||
@@ -1152,9 +1150,8 @@ bool Device::populateOCLDeviceConstants() {
|
||||
checkAtomicSupport();
|
||||
|
||||
assert(cpu_agent_info_->fine_grain_pool.handle != 0);
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_amd_agent_iterate_memory_pools(bkendDevice_, Device::iterateGpuMemoryPoolCallback,
|
||||
this)) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_amd_agent_iterate_memory_pools(
|
||||
bkendDevice_, Device::iterateGpuMemoryPoolCallback, this)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1188,9 +1185,9 @@ bool Device::populateOCLDeviceConstants() {
|
||||
}
|
||||
|
||||
size_t group_segment_size = 0;
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_amd_memory_pool_get_info(group_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE,
|
||||
&group_segment_size)) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(group_segment_,
|
||||
HSA_AMD_MEMORY_POOL_INFO_SIZE,
|
||||
&group_segment_size)) {
|
||||
return false;
|
||||
}
|
||||
assert(group_segment_size > 0);
|
||||
@@ -1229,16 +1226,16 @@ bool Device::populateOCLDeviceConstants() {
|
||||
|
||||
if (settings().enableLocalMemory_ && gpuvm_segment_.handle != 0) {
|
||||
size_t global_segment_size = 0;
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_amd_memory_pool_get_info(gpuvm_segment_, HSA_AMD_MEMORY_POOL_INFO_SIZE,
|
||||
&global_segment_size)) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_amd_memory_pool_get_info(gpuvm_segment_,
|
||||
HSA_AMD_MEMORY_POOL_INFO_SIZE,
|
||||
&global_segment_size)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
assert(global_segment_size > 0);
|
||||
info_.globalMemSize_ = (static_cast<uint64_t>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
|
||||
static_cast<uint64_t>(global_segment_size)) /
|
||||
100u;
|
||||
100u;
|
||||
|
||||
// For APU with vram size <= 512MiB, use a smaller single alloc percentage
|
||||
if (info_.globalMemSize_ <= 536870912) {
|
||||
@@ -1266,7 +1263,7 @@ bool Device::populateOCLDeviceConstants() {
|
||||
info_.globalMemSize_ = std::max(info_.globalMemSize_, uint64_t(1 * Gi));
|
||||
info_.globalMemSize_ = (static_cast<uint64_t>(std::min(GPU_MAX_HEAP_SIZE, 100u)) *
|
||||
static_cast<uint64_t>(info_.globalMemSize_)) /
|
||||
100u;
|
||||
100u;
|
||||
|
||||
info_.maxMemAllocSize_ =
|
||||
uint64_t(info_.globalMemSize_ * std::min(GPU_SINGLE_ALLOC_PERCENT, 100u) / 100u);
|
||||
@@ -1325,8 +1322,8 @@ bool Device::populateOCLDeviceConstants() {
|
||||
info_.hostUnifiedMemory_ = 1;
|
||||
info_.iommuv2_ = true;
|
||||
}
|
||||
info_.memBaseAddrAlign_ = 8 *
|
||||
(flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? sizeof(int64_t[16]) * 2 : MEMOBJ_BASE_ADDR_ALIGN);
|
||||
info_.memBaseAddrAlign_ = 8 * (flagIsDefault(MEMOBJ_BASE_ADDR_ALIGN) ? sizeof(int64_t[16]) * 2
|
||||
: MEMOBJ_BASE_ADDR_ALIGN);
|
||||
info_.minDataTypeAlignSize_ = sizeof(int64_t[16]);
|
||||
|
||||
info_.maxConstantArgs_ = 8;
|
||||
@@ -1629,14 +1626,14 @@ bool Device::populateOCLDeviceConstants() {
|
||||
if (getIsaMeta(std::move(isa().isaName()), isaMeta)) {
|
||||
std::string addressableNumVGPRs, totalNumVGPRs, vGPRAllocGranule;
|
||||
info_.availableVGPRs_ = getValueFromIsaMeta(isaMeta, "AddressableNumVGPRs", addressableNumVGPRs)
|
||||
? atoi(addressableNumVGPRs.c_str())
|
||||
: 0;
|
||||
? atoi(addressableNumVGPRs.c_str())
|
||||
: 0;
|
||||
info_.vgprsPerSimd_ = getValueFromIsaMeta(isaMeta, "TotalNumVGPRs", totalNumVGPRs)
|
||||
? atoi(totalNumVGPRs.c_str())
|
||||
: 0;
|
||||
? atoi(totalNumVGPRs.c_str())
|
||||
: 0;
|
||||
info_.vgprAllocGranularity_ = getValueFromIsaMeta(isaMeta, "VGPRAllocGranule", vGPRAllocGranule)
|
||||
? atoi(vGPRAllocGranule.c_str())
|
||||
: 0;
|
||||
? atoi(vGPRAllocGranule.c_str())
|
||||
: 0;
|
||||
|
||||
info_.availableRegistersPerCU_ = info_.vgprsPerSimd_ * info_.simdPerCU_ * info_.wavefrontWidth_;
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_INIT,
|
||||
@@ -1647,8 +1644,8 @@ bool Device::populateOCLDeviceConstants() {
|
||||
|
||||
std::string sgprValue;
|
||||
info_.availableSGPRs_ = (getValueFromIsaMeta(isaMeta, "AddressableNumSGPRs", sgprValue))
|
||||
? (atoi(sgprValue.c_str()))
|
||||
: 0;
|
||||
? (atoi(sgprValue.c_str()))
|
||||
: 0;
|
||||
if (!releaseIsaMeta(isaMeta)) {
|
||||
LogInfo("Can not release the isa meta node");
|
||||
}
|
||||
@@ -1663,9 +1660,8 @@ bool Device::populateOCLDeviceConstants() {
|
||||
}
|
||||
|
||||
// This capability should be available with xnack enabled
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_system_get_info(HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT,
|
||||
&info_.hmmCpuMemoryAccessible_)) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_system_get_info(HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT,
|
||||
&info_.hmmCpuMemoryAccessible_)) {
|
||||
LogError("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT query failed.");
|
||||
}
|
||||
|
||||
@@ -1805,9 +1801,9 @@ bool Device::bindExternalDevice(uint flags, void* const gfxDevice[], void* gfxCo
|
||||
}
|
||||
|
||||
return info_.deviceTopology_.pcie.bus == info.pci_bus &&
|
||||
info_.deviceTopology_.pcie.device == info.pci_device &&
|
||||
info_.deviceTopology_.pcie.function == info.pci_function &&
|
||||
info_.vendorId_ == info.vendor_id && pciDeviceId_ == info.device_id;
|
||||
info_.deviceTopology_.pcie.device == info.pci_device &&
|
||||
info_.deviceTopology_.pcie.function == info.pci_function &&
|
||||
info_.vendorId_ == info.vendor_id && pciDeviceId_ == info.device_id;
|
||||
|
||||
#endif
|
||||
}
|
||||
@@ -2224,10 +2220,10 @@ void Device::releaseMemory(void* ptr, size_t size) const {
|
||||
|
||||
void* Device::deviceLocalAlloc(size_t size, bool atomics, bool pseudo_fine_grain,
|
||||
bool contiguous) const {
|
||||
const hsa_amd_memory_pool_t& pool = (pseudo_fine_grain && gpu_ext_fine_grained_segment_.handle)
|
||||
? gpu_ext_fine_grained_segment_
|
||||
: (atomics && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_
|
||||
: gpuvm_segment_;
|
||||
const hsa_amd_memory_pool_t& pool =
|
||||
(pseudo_fine_grain && gpu_ext_fine_grained_segment_.handle) ? gpu_ext_fine_grained_segment_
|
||||
: (atomics && gpu_fine_grained_segment_.handle) ? gpu_fine_grained_segment_
|
||||
: gpuvm_segment_;
|
||||
|
||||
if (pool.handle == 0 || gpuvm_segment_max_alloc_ == 0) {
|
||||
DevLogPrintfError("Invalid argument, pool_handle: 0x%x , max_alloc: %u \n", pool.handle,
|
||||
@@ -2474,9 +2470,8 @@ bool Device::SetSvmAttributesInt(const void* dev_ptr, size_t count, amd::MemoryA
|
||||
amd::Memory* svm_mem = amd::MemObjMap::FindMemObj(dev_ptr);
|
||||
if ((nullptr == svm_mem) || ((svm_mem->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) == 0) ||
|
||||
// Validate the range of provided memory
|
||||
((svm_mem->getSize() -
|
||||
(reinterpret_cast<const_address>(dev_ptr) -
|
||||
reinterpret_cast<address>(svm_mem->getSvmPtr()))) < count)) {
|
||||
((svm_mem->getSize() - (reinterpret_cast<const_address>(dev_ptr) -
|
||||
reinterpret_cast<address>(svm_mem->getSvmPtr()))) < count)) {
|
||||
LogPrintfError("SetSvmAttributes received unknown memory for update: %p!", dev_ptr);
|
||||
return false;
|
||||
}
|
||||
@@ -2565,9 +2560,8 @@ bool Device::GetSvmAttributes(void** data, size_t* data_sizes, int* attributes,
|
||||
amd::Memory* svm_mem = amd::MemObjMap::FindMemObj(dev_ptr);
|
||||
if ((nullptr == svm_mem) || ((svm_mem->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) == 0) ||
|
||||
// Validate the range of provided memory
|
||||
((svm_mem->getSize() -
|
||||
(reinterpret_cast<const_address>(dev_ptr) -
|
||||
reinterpret_cast<address>(svm_mem->getSvmPtr()))) < count)) {
|
||||
((svm_mem->getSize() - (reinterpret_cast<const_address>(dev_ptr) -
|
||||
reinterpret_cast<address>(svm_mem->getSvmPtr()))) < count)) {
|
||||
LogPrintfError("GetSvmAttributes received unknown memory %p for state!", dev_ptr);
|
||||
return false;
|
||||
}
|
||||
@@ -3493,9 +3487,8 @@ bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer
|
||||
}
|
||||
|
||||
if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) {
|
||||
if ((size != 0) &&
|
||||
((reinterpret_cast<const_address>(dev_ptr) -
|
||||
reinterpret_cast<const_address>(ptr_info->agentBaseAddress)) > size)) {
|
||||
if ((size != 0) && ((reinterpret_cast<const_address>(dev_ptr) -
|
||||
reinterpret_cast<const_address>(ptr_info->agentBaseAddress)) > size)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
||||
@@ -835,9 +835,8 @@ bool Buffer::create(bool alloc_local) {
|
||||
} else if (memFlags & ROCCLR_MEM_HSA_SIGNAL_MEMORY) {
|
||||
// TODO: ROCr will introduce a new attribute enum that implies a non-blocking signal,
|
||||
// replace "HSA_AMD_SIGNAL_AMD_GPU_ONLY" with this new enum when it is ready.
|
||||
if (HSA_STATUS_SUCCESS !=
|
||||
hsa_amd_signal_create(kInitSignalValueOne, 0, nullptr, HSA_AMD_SIGNAL_AMD_GPU_ONLY,
|
||||
&signal_)) {
|
||||
if (HSA_STATUS_SUCCESS != hsa_amd_signal_create(kInitSignalValueOne, 0, nullptr,
|
||||
HSA_AMD_SIGNAL_AMD_GPU_ONLY, &signal_)) {
|
||||
ClPrint(amd::LOG_ERROR, amd::LOG_MEM,
|
||||
"[ROCclr] ROCCLR_MEM_HSA_SIGNAL_MEMORY signal creation failed");
|
||||
return false;
|
||||
@@ -1316,8 +1315,8 @@ bool Image::create(bool alloc_local) {
|
||||
// support alignment larger than HSA memory region allocation granularity.
|
||||
// In this case, the user manages the alignment.
|
||||
const size_t alloc_size = (deviceImageInfo_.alignment <= dev().alloc_granularity())
|
||||
? deviceImageInfo_.size
|
||||
: deviceImageInfo_.size + deviceImageInfo_.alignment;
|
||||
? deviceImageInfo_.size
|
||||
: deviceImageInfo_.size + deviceImageInfo_.alignment;
|
||||
|
||||
if (!(owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR)) {
|
||||
originalDeviceMemory_ = dev().deviceLocalAlloc(alloc_size);
|
||||
@@ -1357,8 +1356,8 @@ bool Image::createView(const Memory& parent) {
|
||||
deviceMemory_ = parent.getDeviceMemory();
|
||||
|
||||
originalDeviceMemory_ = (parent.owner()->asBuffer() != nullptr)
|
||||
? deviceMemory_
|
||||
: static_cast<const Image&>(parent).originalDeviceMemory_;
|
||||
? deviceMemory_
|
||||
: static_cast<const Image&>(parent).originalDeviceMemory_;
|
||||
|
||||
// Detect image view from buffer to distinguish linear paths from tiled.
|
||||
amd::Memory* ancestor = parent.owner();
|
||||
@@ -1411,10 +1410,10 @@ bool Image::createView(const Memory& parent) {
|
||||
break;
|
||||
}
|
||||
hsa_ext_image_t hsaImage;
|
||||
if (HSA_STATUS_SUCCESS ==
|
||||
hsa_ext_image_create_with_layout(
|
||||
dev().getBackendDevice(), &imageDescriptor_, deviceMemory_, permission_,
|
||||
HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, tryPitch, 0, &hsaImage)) {
|
||||
if (HSA_STATUS_SUCCESS == hsa_ext_image_create_with_layout(
|
||||
dev().getBackendDevice(), &imageDescriptor_, deviceMemory_,
|
||||
permission_, HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR, tryPitch, 0,
|
||||
&hsaImage)) {
|
||||
// The image pitch from app is not expectation of the GPU
|
||||
LogWarning("[OCL] will use copy image");
|
||||
workaround = true;
|
||||
|
||||
@@ -153,10 +153,10 @@ class Memory : public device::Memory {
|
||||
|
||||
// Get MemorySegment type in terms of host memory allocation flags
|
||||
Device::MemorySegment getHostMemorySegment(const unsigned int memFlags) {
|
||||
return (memFlags & CL_MEM_SVM_ATOMICS) == 0
|
||||
? Device::MemorySegment::kNoAtomics
|
||||
: ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0 ? Device::MemorySegment::kUncachedAtomics
|
||||
: Device::MemorySegment::kAtomics);
|
||||
return (memFlags & CL_MEM_SVM_ATOMICS) == 0 ? Device::MemorySegment::kNoAtomics
|
||||
: ((memFlags & ROCCLR_MEM_HSA_UNCACHED) != 0
|
||||
? Device::MemorySegment::kUncachedAtomics
|
||||
: Device::MemorySegment::kAtomics);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
@@ -177,8 +177,8 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t
|
||||
case 4:
|
||||
if (printFloat) {
|
||||
const float fArg = size == 2
|
||||
? amd::half2float(*(reinterpret_cast<const uint16_t*>(argument)))
|
||||
: *(reinterpret_cast<const float*>(argument));
|
||||
? amd::half2float(*(reinterpret_cast<const uint16_t*>(argument)))
|
||||
: *(reinterpret_cast<const float*>(argument));
|
||||
static const char* fSpecifiers = "eEfgGa";
|
||||
std::string fmtF = fmt;
|
||||
size_t posS = fmtF.find_first_of("%");
|
||||
@@ -216,13 +216,12 @@ size_t PrintfDbg::outputArgument(const std::string& fmt, bool printFloat, size_t
|
||||
hhFmt.erase(hhFmt.find_first_of("h"), 2);
|
||||
amd::Os::printf(hhFmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
|
||||
} else if (hlModifier) {
|
||||
amd::Os::printf(hlFmt.data(),
|
||||
size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
|
||||
: *(reinterpret_cast<const uint32_t*>(argument)));
|
||||
amd::Os::printf(hlFmt.data(), size == 2
|
||||
? *(reinterpret_cast<const uint16_t*>(argument))
|
||||
: *(reinterpret_cast<const uint32_t*>(argument)));
|
||||
} else {
|
||||
amd::Os::printf(fmt.data(),
|
||||
size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
|
||||
: *(reinterpret_cast<const uint32_t*>(argument)));
|
||||
amd::Os::printf(fmt.data(), size == 2 ? *(reinterpret_cast<const uint16_t*>(argument))
|
||||
: *(reinterpret_cast<const uint32_t*>(argument)));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@@ -57,8 +57,8 @@ struct AmdAqlWrap {
|
||||
// Its incremented on the
|
||||
// start and decremented on the finish. The parent kernel can be
|
||||
// considered as done when the value is 0 and the state is DONE
|
||||
uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t)
|
||||
uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
|
||||
uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t)
|
||||
uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
|
||||
uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
|
||||
uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects
|
||||
uint32_t reserved[5]; //!< For the future usage
|
||||
|
||||
@@ -240,7 +240,7 @@ void Settings::setKernelArgImpl(const amd::Isa& isa, bool isXgmi, bool hasValidH
|
||||
const uint32_t gfxStepping = isa.versionStepping();
|
||||
|
||||
const bool isGfx94x = gfxipMajor == 9 && gfxipMinor >= 4 &&
|
||||
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2);
|
||||
(gfxStepping == 0 || gfxStepping == 1 || gfxStepping == 2);
|
||||
const bool isGfx90a = (gfxipMajor == 9 && gfxipMinor == 0 && gfxStepping == 10);
|
||||
const bool isPreGfx908 =
|
||||
(gfxipMajor < 9) || ((gfxipMajor == 9) && (gfxipMinor == 0) && (gfxStepping < 8));
|
||||
|
||||
@@ -879,11 +879,11 @@ bool VirtualGPU::processMemObjects(const amd::Kernel& kernel, const_address para
|
||||
} else {
|
||||
ClPrint(amd::LOG_INFO, amd::LOG_KERN, "Arg%d: %s %s = val:0x%lx (size:0x%x)", i,
|
||||
desc.typeName_.c_str(), desc.name_.c_str(),
|
||||
(desc.size_ == 1) ? *reinterpret_cast<const uint8_t*>(srcArgPtr)
|
||||
: (desc.size_ == 2) ? *reinterpret_cast<const uint16_t*>(srcArgPtr)
|
||||
: (desc.size_ == 4) ? *reinterpret_cast<const uint32_t*>(srcArgPtr)
|
||||
: (desc.size_ == 8) ? *reinterpret_cast<const uint64_t*>(srcArgPtr)
|
||||
: 0LL,
|
||||
(desc.size_ == 1) ? *reinterpret_cast<const uint8_t*>(srcArgPtr)
|
||||
: (desc.size_ == 2) ? *reinterpret_cast<const uint16_t*>(srcArgPtr)
|
||||
: (desc.size_ == 4) ? *reinterpret_cast<const uint32_t*>(srcArgPtr)
|
||||
: (desc.size_ == 8) ? *reinterpret_cast<const uint64_t*>(srcArgPtr)
|
||||
: 0LL,
|
||||
desc.size_);
|
||||
}
|
||||
}
|
||||
@@ -1362,10 +1362,10 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
|
||||
HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
|
||||
cache_state, barrier_value_packet_.signal, barrier_value_packet_.value,
|
||||
barrier_value_packet_.mask,
|
||||
barrier_value_packet_.cond == 0 ? "EQ"
|
||||
: barrier_value_packet_.cond == 1 ? "NE"
|
||||
: barrier_value_packet_.cond == 2 ? "LT"
|
||||
: "GTE",
|
||||
barrier_value_packet_.cond == 0 ? "EQ"
|
||||
: barrier_value_packet_.cond == 1 ? "NE"
|
||||
: barrier_value_packet_.cond == 2 ? "LT"
|
||||
: "GTE",
|
||||
barrier_value_packet_.completion_signal, read, index);
|
||||
// Clear dependent signals for the next packet
|
||||
barrier_value_packet_.signal = hsa_signal_t{};
|
||||
@@ -1432,21 +1432,23 @@ VirtualGPU::VirtualGPU(Device& device, bool profiling, bool cooperative,
|
||||
cooperative_ = cooperative;
|
||||
|
||||
if (device.settings().fenceScopeAgent_) {
|
||||
dispatchPacketHeaderNoSync_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
dispatchPacketHeaderNoSync_ =
|
||||
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
dispatchPacketHeader_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
} else {
|
||||
dispatchPacketHeaderNoSync_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
dispatchPacketHeaderNoSync_ =
|
||||
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
dispatchPacketHeader_ = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
|
||||
}
|
||||
|
||||
aqlHeader_ = dispatchPacketHeader_;
|
||||
@@ -2091,8 +2093,8 @@ void VirtualGPU::submitSvmPrefetchAsync(amd::SvmPrefetchAsyncCommand& cmd) {
|
||||
// Find the requested agent for the transfer
|
||||
hsa_agent_t agent =
|
||||
(cmd.cpu_access() || (dev().settings().hmmFlags_ & Settings::Hmm::EnableSystemMemory))
|
||||
? dev().getCpuAgent(cmd.numa_id())
|
||||
: (static_cast<const roc::Device*>(cmd.device()))->getBackendDevice();
|
||||
? dev().getCpuAgent(cmd.numa_id())
|
||||
: (static_cast<const roc::Device*>(cmd.device()))->getBackendDevice();
|
||||
|
||||
// Initiate a prefetch command
|
||||
hsa_status_t status =
|
||||
@@ -3000,7 +3002,7 @@ void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) {
|
||||
size_t fillSize = patternSize * cmd.times();
|
||||
|
||||
size_t offset = reinterpret_cast<uintptr_t>(cmd.dst()) -
|
||||
reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());
|
||||
reinterpret_cast<uintptr_t>(dstMemory->getSvmPtr());
|
||||
|
||||
Memory* memory = dev().getRocMemory(dstMemory);
|
||||
|
||||
@@ -3567,9 +3569,9 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const
|
||||
if (aql_packet != nullptr) {
|
||||
*aql_packet = dispatchPacket;
|
||||
aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
(1 << HSA_PACKET_HEADER_BARRIER) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
||||
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
||||
aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
}
|
||||
|
||||
|
||||
@@ -475,9 +475,9 @@ class VirtualGPU : public device::VirtualDevice {
|
||||
const uint8_t* aqlPacket = nullptr, bool attach_signal = false);
|
||||
bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header, uint16_t rest,
|
||||
bool blocking = true, bool attach_signal = false);
|
||||
template <typename AqlPacket>
|
||||
bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking,
|
||||
bool attach_signal = false);
|
||||
template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
|
||||
uint16_t rest, bool blocking,
|
||||
bool attach_signal = false);
|
||||
|
||||
bool dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet, const uint32_t gfxVersion,
|
||||
bool blocking, const hsa_ven_amd_aqlprofile_1_00_pfn_t* extApi);
|
||||
|
||||
@@ -384,10 +384,10 @@ class elfio {
|
||||
bool is_sect_in_seg(Elf64_Off sect_begin, Elf_Xword sect_size, Elf64_Off seg_begin,
|
||||
Elf64_Off seg_end) {
|
||||
return seg_begin <= sect_begin && sect_begin + sect_size <= seg_end &&
|
||||
sect_begin <
|
||||
seg_end; // this is important criteria when sect_size == 0
|
||||
// Example: seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11)
|
||||
// sect_begin=12, sect_size=0 -> shall return false!
|
||||
sect_begin <
|
||||
seg_end; // this is important criteria when sect_size == 0
|
||||
// Example: seg_begin=10, seg_end=12 (-> covering the bytes 10 and 11)
|
||||
// sect_begin=12, sect_size=0 -> shall return false!
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
@@ -447,7 +447,7 @@ class elfio {
|
||||
section* sec = sections_.at(i);
|
||||
|
||||
std::streampos headerPosition = (std::streamoff)header->get_sections_offset() +
|
||||
header->get_section_entry_size() * sec->get_index();
|
||||
header->get_section_entry_size() * sec->get_index();
|
||||
|
||||
sec->save(stream, headerPosition, sec->get_offset());
|
||||
}
|
||||
|
||||
@@ -130,7 +130,7 @@ template <class S> class note_section_accessor_template {
|
||||
Elf_Word descsz = convertor(*(const Elf_Word*)(data + current + sizeof(namesz)));
|
||||
|
||||
current += 3 * sizeof(Elf_Word) + ((namesz + align - 1) / align) * align +
|
||||
((descsz + align - 1) / align) * align;
|
||||
((descsz + align - 1) / align) * align;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -104,8 +104,8 @@ template <class S> class relocation_section_accessor_template {
|
||||
unsigned char other;
|
||||
|
||||
symbol_section_accessor symbols(elf_file, elf_file.sections[get_symbol_table_index()]);
|
||||
ret = ret &&
|
||||
symbols.get_symbol(symbol, symbolName, symbolValue, size, bind, symbolType, section, other);
|
||||
ret = ret && symbols.get_symbol(symbol, symbolName, symbolValue, size, bind, symbolType,
|
||||
section, other);
|
||||
|
||||
if (ret) { // Was it successful?
|
||||
switch (type) {
|
||||
@@ -207,9 +207,9 @@ template <class S> class relocation_section_accessor_template {
|
||||
Elf_Half get_symbol_table_index() const { return (Elf_Half)relocation_section->get_link(); }
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
template <class T>
|
||||
void generic_get_entry_rel(Elf_Xword index, Elf64_Addr& offset, Elf_Word& symbol, Elf_Word& type,
|
||||
Elf_Sxword& addend) const {
|
||||
template <class T> void generic_get_entry_rel(Elf_Xword index, Elf64_Addr& offset,
|
||||
Elf_Word& symbol, Elf_Word& type,
|
||||
Elf_Sxword& addend) const {
|
||||
const endianess_convertor& convertor = elf_file.get_convertor();
|
||||
|
||||
const T* pEntry = reinterpret_cast<const T*>(relocation_section->get_data() +
|
||||
@@ -222,9 +222,9 @@ template <class S> class relocation_section_accessor_template {
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
template <class T>
|
||||
void generic_get_entry_rela(Elf_Xword index, Elf64_Addr& offset, Elf_Word& symbol, Elf_Word& type,
|
||||
Elf_Sxword& addend) const {
|
||||
template <class T> void generic_get_entry_rela(Elf_Xword index, Elf64_Addr& offset,
|
||||
Elf_Word& symbol, Elf_Word& type,
|
||||
Elf_Sxword& addend) const {
|
||||
const endianess_convertor& convertor = elf_file.get_convertor();
|
||||
|
||||
const T* pEntry = reinterpret_cast<const T*>(relocation_section->get_data() +
|
||||
|
||||
@@ -255,10 +255,10 @@ template <class S> class symbol_section_accessor_template {
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
template <class T>
|
||||
bool generic_get_symbol(Elf_Xword index, std::string& name, Elf64_Addr& value, Elf_Xword& size,
|
||||
unsigned char& bind, unsigned char& type, Elf_Half& section_index,
|
||||
unsigned char& other) const {
|
||||
template <class T> bool generic_get_symbol(Elf_Xword index, std::string& name, Elf64_Addr& value,
|
||||
Elf_Xword& size, unsigned char& bind,
|
||||
unsigned char& type, Elf_Half& section_index,
|
||||
unsigned char& other) const {
|
||||
bool ret = false;
|
||||
|
||||
if (0 != symbol_section->get_data() && index < get_symbols_num()) {
|
||||
@@ -287,9 +287,9 @@ template <class S> class symbol_section_accessor_template {
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
template <class T>
|
||||
Elf_Word generic_add_symbol(Elf_Word name, Elf64_Addr value, Elf_Xword size, unsigned char info,
|
||||
unsigned char other, Elf_Half shndx) {
|
||||
template <class T> Elf_Word generic_add_symbol(Elf_Word name, Elf64_Addr value, Elf_Xword size,
|
||||
unsigned char info, unsigned char other,
|
||||
Elf_Half shndx) {
|
||||
const endianess_convertor& convertor = elf_file.get_convertor();
|
||||
|
||||
T entry;
|
||||
|
||||
@@ -66,9 +66,9 @@ class endianess_convertor {
|
||||
return value;
|
||||
}
|
||||
value = ((value & 0x00000000000000FFull) << 56) | ((value & 0x000000000000FF00ull) << 40) |
|
||||
((value & 0x0000000000FF0000ull) << 24) | ((value & 0x00000000FF000000ull) << 8) |
|
||||
((value & 0x000000FF00000000ull) >> 8) | ((value & 0x0000FF0000000000ull) >> 24) |
|
||||
((value & 0x00FF000000000000ull) >> 40) | ((value & 0xFF00000000000000ull) >> 56);
|
||||
((value & 0x0000000000FF0000ull) << 24) | ((value & 0x00000000FF000000ull) << 8) |
|
||||
((value & 0x000000FF00000000ull) >> 8) | ((value & 0x0000FF0000000000ull) >> 24) |
|
||||
((value & 0x00FF000000000000ull) >> 40) | ((value & 0xFF00000000000000ull) >> 56);
|
||||
|
||||
return value;
|
||||
}
|
||||
@@ -87,7 +87,7 @@ class endianess_convertor {
|
||||
return value;
|
||||
}
|
||||
value = ((value & 0x000000FF) << 24) | ((value & 0x0000FF00) << 8) |
|
||||
((value & 0x00FF0000) >> 8) | ((value & 0xFF000000) >> 24);
|
||||
((value & 0x00FF0000) >> 8) | ((value & 0xFF000000) >> 24);
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
@@ -471,7 +471,7 @@ inline void Os::ThreadAffinityMask::clear(uint cpu) {
|
||||
|
||||
inline bool Os::ThreadAffinityMask::isSet(uint cpu) const {
|
||||
return (KAFFINITY)0 !=
|
||||
(mask_[cpu / (8 * sizeof(KAFFINITY))] & ((KAFFINITY)1 << (cpu % (8 * sizeof(KAFFINITY)))));
|
||||
(mask_[cpu / (8 * sizeof(KAFFINITY))] & ((KAFFINITY)1 << (cpu % (8 * sizeof(KAFFINITY)))));
|
||||
}
|
||||
|
||||
inline bool Os::ThreadAffinityMask::isEmpty() const {
|
||||
|
||||
@@ -301,10 +301,9 @@ const Event::EventWaitList Event::nullWaitList(0);
|
||||
// ================================================================================================
|
||||
Command::Command(HostQueue& queue, cl_command_type type, const EventWaitList& eventWaitList,
|
||||
uint32_t commandWaitBits, const Event* waitingEvent)
|
||||
: Event(queue,
|
||||
amd::activity_prof::IsEnabled(amd::activity_prof::OperationId(type)) ||
|
||||
queue.properties().test(CL_QUEUE_PROFILING_ENABLE) ||
|
||||
Agent::shouldPostEventEvents()),
|
||||
: Event(queue, amd::activity_prof::IsEnabled(amd::activity_prof::OperationId(type)) ||
|
||||
queue.properties().test(CL_QUEUE_PROFILING_ENABLE) ||
|
||||
Agent::shouldPostEventEvents()),
|
||||
queue_(&queue),
|
||||
next_(nullptr),
|
||||
type_(type),
|
||||
@@ -604,24 +603,24 @@ bool CopyMemoryCommand::isEntireMemory() const {
|
||||
Coord3D imageSize(size()[0] * size()[1] * size()[2] *
|
||||
source().asImage()->getImageFormat().getElementSize());
|
||||
result = source().isEntirelyCovered(srcOrigin(), size()) &&
|
||||
destination().isEntirelyCovered(dstOrigin(), imageSize);
|
||||
destination().isEntirelyCovered(dstOrigin(), imageSize);
|
||||
} break;
|
||||
case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
|
||||
Coord3D imageSize(size()[0] * size()[1] * size()[2] *
|
||||
destination().asImage()->getImageFormat().getElementSize());
|
||||
result = source().isEntirelyCovered(srcOrigin(), imageSize) &&
|
||||
destination().isEntirelyCovered(dstOrigin(), size());
|
||||
destination().isEntirelyCovered(dstOrigin(), size());
|
||||
} break;
|
||||
case CL_COMMAND_COPY_BUFFER_RECT: {
|
||||
Coord3D rectSize(size()[0] * size()[1] * size()[2]);
|
||||
Coord3D srcOffs(srcRect().start_);
|
||||
Coord3D dstOffs(dstRect().start_);
|
||||
result = source().isEntirelyCovered(srcOffs, rectSize) &&
|
||||
destination().isEntirelyCovered(dstOffs, rectSize);
|
||||
destination().isEntirelyCovered(dstOffs, rectSize);
|
||||
} break;
|
||||
default:
|
||||
result = source().isEntirelyCovered(srcOrigin(), size()) &&
|
||||
destination().isEntirelyCovered(dstOrigin(), size());
|
||||
destination().isEntirelyCovered(dstOrigin(), size());
|
||||
break;
|
||||
}
|
||||
return result;
|
||||
|
||||
@@ -260,9 +260,8 @@ int Context::create(const intptr_t* properties) {
|
||||
}
|
||||
|
||||
// Check if OCL context can be associated with any external device
|
||||
if (info_.flags_ &
|
||||
(D3D10DeviceKhr | D3D11DeviceKhr | GLDeviceKhr | D3D9DeviceKhr | D3D9DeviceEXKhr |
|
||||
D3D9DeviceVAKhr)) {
|
||||
if (info_.flags_ & (D3D10DeviceKhr | D3D11DeviceKhr | GLDeviceKhr | D3D9DeviceKhr |
|
||||
D3D9DeviceEXKhr | D3D9DeviceVAKhr)) {
|
||||
// Loop through all devices
|
||||
for (const auto& it : devices_) {
|
||||
if (!it->bindExternalDevice(info_.flags_, info_.hDev_, info_.hCtx_, VALIDATE_ONLY)) {
|
||||
|
||||
@@ -75,10 +75,10 @@ size_t KernelParameters::localMemSize(size_t minDataTypeAlignment) const {
|
||||
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
||||
if (desc.size_ == 8) {
|
||||
memSize = alignUp(memSize, minDataTypeAlignment) +
|
||||
*reinterpret_cast<const uint64_t*>(values_ + desc.offset_);
|
||||
*reinterpret_cast<const uint64_t*>(values_ + desc.offset_);
|
||||
} else {
|
||||
memSize = alignUp(memSize, minDataTypeAlignment) +
|
||||
*reinterpret_cast<const uint32_t*>(values_ + desc.offset_);
|
||||
*reinterpret_cast<const uint32_t*>(values_ + desc.offset_);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -300,10 +300,10 @@ address KernelParameters::capture(device::VirtualDevice& vDev, uint64_t lclMemSi
|
||||
} else if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
||||
if (desc.size_ == 8) {
|
||||
lclMemSize = alignUp(lclMemSize, device.info().minDataTypeAlignSize_) +
|
||||
*reinterpret_cast<const uint64_t*>(values_ + desc.offset_);
|
||||
*reinterpret_cast<const uint64_t*>(values_ + desc.offset_);
|
||||
} else {
|
||||
lclMemSize = alignUp(lclMemSize, device.info().minDataTypeAlignSize_) +
|
||||
*reinterpret_cast<const uint32_t*>(values_ + desc.offset_);
|
||||
*reinterpret_cast<const uint32_t*>(values_ + desc.offset_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -158,10 +158,11 @@ class KernelParameters : protected HeapObject {
|
||||
execNewVcop_(0),
|
||||
execPfpaVcop_(0),
|
||||
deviceKernelArgs_(false) {
|
||||
totalSize_ = signature.paramsSize() +
|
||||
totalSize_ =
|
||||
signature.paramsSize() +
|
||||
(signature.numMemories() + signature.numSamplers() + signature.numQueues()) * sizeof(void*);
|
||||
values_ = reinterpret_cast<address>(this) +
|
||||
alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT);
|
||||
alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT);
|
||||
memoryObjOffset_ = signature_.paramsSize();
|
||||
memoryObjects_ = reinterpret_cast<amd::Memory**>(values_ + memoryObjOffset_);
|
||||
samplerObjOffset_ = memoryObjOffset_ + signature_.numMemories() * sizeof(amd::Memory*);
|
||||
@@ -186,7 +187,7 @@ class KernelParameters : protected HeapObject {
|
||||
execPfpaVcop_(rhs.execPfpaVcop_),
|
||||
deviceKernelArgs_(false) {
|
||||
values_ = reinterpret_cast<address>(this) +
|
||||
alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT);
|
||||
alignUp(sizeof(KernelParameters), PARAMETERS_MIN_ALIGNMENT);
|
||||
memoryObjOffset_ = signature_.paramsSize();
|
||||
memoryObjects_ = reinterpret_cast<amd::Memory**>(values_ + memoryObjOffset_);
|
||||
samplerObjOffset_ = memoryObjOffset_ + signature_.numMemories() * sizeof(amd::Memory*);
|
||||
@@ -223,7 +224,8 @@ class KernelParameters : protected HeapObject {
|
||||
//! Allocate memory for this instance as well as the required storage for
|
||||
// the values_, defined_, and rawPointer_ arrays.
|
||||
void* operator new(size_t size, const KernelSignature& signature) {
|
||||
size_t requiredSize = alignUp(size, PARAMETERS_MIN_ALIGNMENT) + signature.paramsSize() +
|
||||
size_t requiredSize =
|
||||
alignUp(size, PARAMETERS_MIN_ALIGNMENT) + signature.paramsSize() +
|
||||
(signature.numMemories() + signature.numSamplers() + signature.numQueues()) * sizeof(void*);
|
||||
return AlignedMemory::allocate(requiredSize, PARAMETERS_MIN_ALIGNMENT);
|
||||
}
|
||||
|
||||
@@ -57,9 +57,9 @@ bool HostMemoryReference::allocateMemory(size_t size, const Context& context) {
|
||||
size_t memoryAlignment = (CPU_MEMORY_ALIGNMENT_SIZE <= 0) ? 256 : CPU_MEMORY_ALIGNMENT_SIZE;
|
||||
size_ = amd::alignUp(size, memoryAlignment);
|
||||
//! \note memory size must be aligned for CAL pinning
|
||||
hostMem_ = CPU_MEMORY_GUARD_PAGES
|
||||
? GuardedMemory::allocate(size_, MEMOBJ_BASE_ADDR_ALIGN, CPU_MEMORY_GUARD_PAGE_SIZE * Ki)
|
||||
: context.hostAlloc(size_, MEMOBJ_BASE_ADDR_ALIGN);
|
||||
hostMem_ = CPU_MEMORY_GUARD_PAGES ? GuardedMemory::allocate(size_, MEMOBJ_BASE_ADDR_ALIGN,
|
||||
CPU_MEMORY_GUARD_PAGE_SIZE * Ki)
|
||||
: context.hostAlloc(size_, MEMOBJ_BASE_ADDR_ALIGN);
|
||||
alloced_ = (hostMem_ != NULL);
|
||||
return alloced_;
|
||||
}
|
||||
@@ -146,7 +146,7 @@ Memory::Memory(Memory& parent, Flags flags, size_t origin, size_t size, Type typ
|
||||
|
||||
if ((flags_ & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) == 0) {
|
||||
flags_ |= parent_->getMemFlags() &
|
||||
(CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS);
|
||||
(CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -590,8 +590,8 @@ bool Buffer::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) con
|
||||
|
||||
bool Buffer::validateRegion(const Coord3D& origin, const Coord3D& region) const {
|
||||
return ((region[0] > 0) && (origin[0] < getSize()) && ((origin[0] + region[0]) <= getSize()))
|
||||
? true
|
||||
: false;
|
||||
? true
|
||||
: false;
|
||||
}
|
||||
|
||||
void Pipe::initDeviceMemory() {
|
||||
@@ -614,7 +614,7 @@ Image::Image(const Format& format, Image& parent, uint baseMipLevel, cl_mem_flag
|
||||
baseMipLevel_(baseMipLevel) {
|
||||
if (baseMipLevel > 0) {
|
||||
impl_.region_.c[0] = GETMIPDIM(parent.getWidth(), baseMipLevel) *
|
||||
parent.getImageFormat().getElementSize() / format.getElementSize();
|
||||
parent.getImageFormat().getElementSize() / format.getElementSize();
|
||||
impl_.region_.c[1] = GETMIPDIM(parent.getHeight(), baseMipLevel);
|
||||
impl_.region_.c[2] = GETMIPDIM(parent.getDepth(), baseMipLevel);
|
||||
|
||||
@@ -1030,9 +1030,9 @@ const cl_image_format Image::supportedFormats[] = {
|
||||
{CL_DEPTH, CL_FLOAT},
|
||||
};
|
||||
|
||||
const uint32_t NUM_CHANNEL_ORDER_OF_RGB = 1; // The number of channel orders of RGB at the end of
|
||||
// the table supportedFormats above and before sRGB
|
||||
// and depth.
|
||||
const uint32_t NUM_CHANNEL_ORDER_OF_RGB = 1; // The number of channel orders of RGB at the end of
|
||||
// the table supportedFormats above and before sRGB
|
||||
// and depth.
|
||||
const uint32_t NUM_CHANNEL_ORDER_OF_sRGB = 1; // The number of channel orders of sRGB at the end of
|
||||
// the table supportedFormats above and before depth.
|
||||
const uint32_t NUM_CHANNEL_ORDER_OF_DEPTH =
|
||||
@@ -1246,8 +1246,8 @@ Image* Image::createView(const Context& context, const Format& format, device::V
|
||||
bool Image::isEntirelyCovered(const Coord3D& origin, const Coord3D& region) const {
|
||||
return (origin[0] == 0 && origin[1] == 0 && origin[2] == 0 && region[0] == getWidth() &&
|
||||
region[1] == getHeight() && region[2] == getDepth())
|
||||
? true
|
||||
: false;
|
||||
? true
|
||||
: false;
|
||||
}
|
||||
|
||||
bool Image::validateRegion(const Coord3D& origin, const Coord3D& region) const {
|
||||
@@ -1255,15 +1255,15 @@ bool Image::validateRegion(const Coord3D& origin, const Coord3D& region) const {
|
||||
(region[0] != 0) && (origin[1] < getHeight()) && (region[1] != 0) &&
|
||||
(origin[2] < getDepth()) && (region[2] != 0) && ((origin[0] + region[0]) <= getWidth()) &&
|
||||
((origin[1] + region[1]) <= getHeight()) && ((origin[2] + region[2]) <= getDepth()))
|
||||
? true
|
||||
: false;
|
||||
? true
|
||||
: false;
|
||||
}
|
||||
|
||||
bool Image::isRowSliceValid(size_t rowPitch, size_t slice, size_t width, size_t height) const {
|
||||
size_t tmpHeight = (getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) ? 1 : height;
|
||||
|
||||
bool valid = (rowPitch == 0) ||
|
||||
((rowPitch != 0) && (rowPitch >= width * getImageFormat().getElementSize()));
|
||||
((rowPitch != 0) && (rowPitch >= width * getImageFormat().getElementSize()));
|
||||
|
||||
return ((slice == 0) || ((slice != 0) && (slice >= rowPitch * tmpHeight))) ? valid : false;
|
||||
}
|
||||
|
||||
@@ -530,7 +530,7 @@ class Image : public Memory {
|
||||
//! Compare 2 image formats.
|
||||
bool operator==(const Format& rhs) const {
|
||||
return image_channel_order == rhs.image_channel_order &&
|
||||
image_channel_data_type == rhs.image_channel_data_type;
|
||||
image_channel_data_type == rhs.image_channel_data_type;
|
||||
}
|
||||
bool operator!=(const Format& rhs) const { return !(*this == rhs); }
|
||||
|
||||
|
||||
@@ -170,8 +170,8 @@ int32_t Program::addDeviceProgram(Device& device, const void* image, size_t leng
|
||||
}
|
||||
}
|
||||
options->oVariables->Legacy = !device.settings().useLightning_
|
||||
? isAMDILTarget(*amd::aclutGetTargetInfo(binary))
|
||||
: isHSAILTarget(*amd::aclutGetTargetInfo(binary));
|
||||
? isAMDILTarget(*amd::aclutGetTargetInfo(binary))
|
||||
: isHSAILTarget(*amd::aclutGetTargetInfo(binary));
|
||||
amd::Hsail::BinaryFini(binary);
|
||||
}
|
||||
#endif // defined(WITH_COMPILER_LIB)
|
||||
@@ -522,9 +522,8 @@ int32_t Program::build(const std::vector<Device*>& devices, const char* options,
|
||||
for (const auto& it : devices) {
|
||||
option::Options parsedOptions;
|
||||
constexpr bool LinkOptsOnly = false;
|
||||
if ((language_ != HIP) &&
|
||||
!ParseAllOptions(cppstr, parsedOptions, optionChangable, LinkOptsOnly,
|
||||
it->settings().useLightning_)) {
|
||||
if ((language_ != HIP) && !ParseAllOptions(cppstr, parsedOptions, optionChangable, LinkOptsOnly,
|
||||
it->settings().useLightning_)) {
|
||||
programLog_ = parsedOptions.optionsLog();
|
||||
LogError("Parsing compile options failed.");
|
||||
return CL_INVALID_COMPILER_OPTIONS;
|
||||
|
||||
+252
-531
@@ -21,537 +21,258 @@
|
||||
#ifndef FLAGS_HPP_
|
||||
#define FLAGS_HPP_
|
||||
|
||||
|
||||
#define RUNTIME_FLAGS(debug, release, release_on_stg) \
|
||||
\
|
||||
release(int, AMD_LOG_LEVEL, 0, "The default log level") release( \
|
||||
uint, AMD_LOG_MASK, 0X7FFFFFFF, \
|
||||
"The mask to enable specific kinds of logs") release(cstring, AMD_LOG_LEVEL_FILE, "", \
|
||||
"Set output file for AMD_LOG_LEVEL, " \
|
||||
"Default is stderr") release(size_t, \
|
||||
AMD_LOG_LEVEL_SIZE, \
|
||||
2048, \
|
||||
"The max " \
|
||||
"size of " \
|
||||
"AMD_LOG " \
|
||||
"generate" \
|
||||
"d in MB " \
|
||||
"if " \
|
||||
"printed " \
|
||||
"to a " \
|
||||
"file") \
|
||||
debug(uint, DEBUG_GPU_FLAGS, 0, "The debug options for GPU device") release( \
|
||||
size_t, CQ_THREAD_STACK_SIZE, 256 * Ki, /* @todo: that much! */ \
|
||||
"The default command queue thread stack size") release(int, GPU_MAX_WORKGROUP_SIZE, 0, \
|
||||
"Maximum number of workitems in " \
|
||||
"a workgroup for GPU, 0 -use " \
|
||||
"default") \
|
||||
debug(bool, CPU_MEMORY_GUARD_PAGES, false, "Use guard pages for CPU memory") debug( \
|
||||
size_t, CPU_MEMORY_GUARD_PAGE_SIZE, \
|
||||
64, "Size in KB of CPU memory guard page") debug(size_t, CPU_MEMORY_ALIGNMENT_SIZE, \
|
||||
256, \
|
||||
"Size in bytes for the default " \
|
||||
"alignment for guarded memory on " \
|
||||
"CPU") debug(size_t, \
|
||||
PARAMETERS_MIN_ALIGNMENT, \
|
||||
NATIVE_ALIGNMENT_SIZE, \
|
||||
"Minimum alignment " \
|
||||
"required for the " \
|
||||
"abstract parameters " \
|
||||
"stack") debug(size_t, \
|
||||
MEMOBJ_BASE_ADDR_ALIGN, \
|
||||
4 * Ki, \
|
||||
"Align" \
|
||||
"ment " \
|
||||
"of " \
|
||||
"the " \
|
||||
"base " \
|
||||
"addre" \
|
||||
"ss " \
|
||||
"of " \
|
||||
"any " \
|
||||
"alloc" \
|
||||
"ate " \
|
||||
"memor" \
|
||||
"y " \
|
||||
"objec" \
|
||||
"t") \
|
||||
release( \
|
||||
uint, ROC_HMM_FLAGS, \
|
||||
0, "ROCm HMM configuration flags") release(cstring, GPU_DEVICE_ORDINAL, "", \
|
||||
"Select the device ordinal (comma " \
|
||||
"seperated list of available " \
|
||||
"devices)") release(bool, \
|
||||
REMOTE_ALLOC, \
|
||||
false, \
|
||||
"Use remote " \
|
||||
"memory for the " \
|
||||
"global heap " \
|
||||
"allocation") \
|
||||
release(uint, GPU_CP_DMA_COPY_SIZE, 1, \
|
||||
"Set maximum size of CP DMA copy in KiB") release(uint, \
|
||||
GPU_MAX_HEAP_SIZE, \
|
||||
100, \
|
||||
"Set maximum size of " \
|
||||
"the GPU heap to % " \
|
||||
"of board memory") \
|
||||
release( \
|
||||
uint, GPU_STAGING_BUFFER_SIZE, 4, \
|
||||
"Size of the GPU staging buffer in MiB") release(bool, \
|
||||
GPU_DUMP_BLIT_KERNELS, \
|
||||
false, \
|
||||
"Dump the kernels for " \
|
||||
"blit manager") \
|
||||
release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \
|
||||
"Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \
|
||||
release(bool, GPU_FLUSH_ON_EXECUTION, false, \
|
||||
"Submit commands to HW on every operation. 0 - Disable, 1 " \
|
||||
"- Enable") release(bool, CL_KHR_FP64, true, \
|
||||
"Enable/Disable support for double " \
|
||||
"precision") release(cstring, \
|
||||
AMD_OCL_BUILD_OPTIONS, \
|
||||
0, \
|
||||
"Set " \
|
||||
"clBuildProgram() " \
|
||||
"and " \
|
||||
"clCompileProgram(" \
|
||||
")'s options " \
|
||||
"(override)") \
|
||||
release(cstring, AMD_OCL_BUILD_OPTIONS_APPEND, 0, \
|
||||
"Append clBuildProgram() and clCompileProgram()'s " \
|
||||
"options") release(cstring, AMD_OCL_LINK_OPTIONS, 0, \
|
||||
"Set clLinkProgram()'s options " \
|
||||
"(override)") \
|
||||
release( \
|
||||
cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \
|
||||
"Append clLinkProgram()'s options") debug(cstring, \
|
||||
AMD_OCL_SUBST_OBJFILE, \
|
||||
0, \
|
||||
"Specify " \
|
||||
"binary " \
|
||||
"substitution" \
|
||||
" config " \
|
||||
"file for " \
|
||||
"OpenCL") \
|
||||
release( \
|
||||
size_t, GPU_PINNED_XFER_SIZE, 32, \
|
||||
"The pinned buffer size for pinning in read/write " \
|
||||
"transfers in MiB") release(size_t, \
|
||||
GPU_PINNED_MIN_XFER_SIZE, \
|
||||
128, \
|
||||
"The minimal buffer " \
|
||||
"size for pinned " \
|
||||
"read/write transfers " \
|
||||
"in MiB") release(size_t, \
|
||||
GPU_RESOURCE_CACHE_SIZE, \
|
||||
64, \
|
||||
"The " \
|
||||
"reso" \
|
||||
"urce" \
|
||||
" cac" \
|
||||
"he " \
|
||||
"size" \
|
||||
" in " \
|
||||
"MB") \
|
||||
release( \
|
||||
size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \
|
||||
"The maximum size accepted for suballocations " \
|
||||
"in KB") release(size_t, GPU_NUM_MEM_DEPENDENCY, \
|
||||
256, \
|
||||
"Number of memory objects for " \
|
||||
"dependency tracking") \
|
||||
release( \
|
||||
size_t, GPU_XFER_BUFFER_SIZE, 0, \
|
||||
"Transfer buffer size for image copy " \
|
||||
"optimization in KB") release(bool, \
|
||||
GPU_IMAGE_DMA, \
|
||||
true, \
|
||||
"Enable DRM " \
|
||||
"DMA for " \
|
||||
"image " \
|
||||
"transfers") \
|
||||
release( \
|
||||
uint, GPU_SINGLE_ALLOC_PERCENT, 100, \
|
||||
"Maximum size of a single allocation " \
|
||||
"as percentage of total") release(uint, \
|
||||
GPU_NUM_COMPUTE_RINGS, \
|
||||
2, \
|
||||
"GPU " \
|
||||
"numb" \
|
||||
"er " \
|
||||
"of " \
|
||||
"comp" \
|
||||
"ute " \
|
||||
"ring" \
|
||||
"s. " \
|
||||
"0 - " \
|
||||
"disa" \
|
||||
"bled" \
|
||||
", 1 " \
|
||||
", " \
|
||||
"2,.." \
|
||||
" - " \
|
||||
"the " \
|
||||
"numb" \
|
||||
"er " \
|
||||
"of " \
|
||||
"comp" \
|
||||
"ute " \
|
||||
"ring" \
|
||||
"s") \
|
||||
release( \
|
||||
bool, AMD_OCL_WAIT_COMMAND, false, \
|
||||
"1 = Enable a wait for every " \
|
||||
"submitted command") release(uint, \
|
||||
GPU_PRINT_CHILD_KERNEL, \
|
||||
0, \
|
||||
"Print" \
|
||||
"s " \
|
||||
"the " \
|
||||
"speci" \
|
||||
"fied " \
|
||||
"numbe" \
|
||||
"r of " \
|
||||
"the " \
|
||||
"child" \
|
||||
" kern" \
|
||||
"els") \
|
||||
release(bool, GPU_USE_DEVICE_QUEUE, \
|
||||
false, \
|
||||
"Use a dedicated device " \
|
||||
"queue for the actual " \
|
||||
"submissions") release(bool, \
|
||||
AMD_THREAD_TRACE_ENABLE, \
|
||||
true, \
|
||||
"Ena" \
|
||||
"ble" \
|
||||
" th" \
|
||||
"rea" \
|
||||
"d " \
|
||||
"tra" \
|
||||
"ce " \
|
||||
"ext" \
|
||||
"ens" \
|
||||
"io" \
|
||||
"n") \
|
||||
release( \
|
||||
uint, OPENCL_VERSION, 200, \
|
||||
"Force GPU opencl version") release(bool, \
|
||||
HSA_LOCAL_MEMORY_ENABLE, \
|
||||
true, \
|
||||
"Enable HSA device local memory usage") \
|
||||
release( \
|
||||
uint, \
|
||||
HSA_KERNARG_POOL_SIZE, \
|
||||
1024 * 1024, \
|
||||
"Kernarg pool size") release(bool, \
|
||||
GPU_MIPMAP, \
|
||||
true, \
|
||||
"Enables GPU mipmap extension") \
|
||||
release( \
|
||||
uint, \
|
||||
GPU_ENABLE_PAL, \
|
||||
2, \
|
||||
"Enables PAL " \
|
||||
"backend. 0 - ROC, " \
|
||||
"1 - PAL, 2 - ROC " \
|
||||
"or PAL") release(bool, DISABLE_DEFERRED_ALLOC, \
|
||||
false, \
|
||||
"Disables deferred memory allocation on device") \
|
||||
release( \
|
||||
int, \
|
||||
AMD_GPU_FORCE_SINGLE_FP_DENORM, \
|
||||
-1, \
|
||||
"Force denorm " \
|
||||
"for single " \
|
||||
"precision: -1 " \
|
||||
"- don't " \
|
||||
"force, 0 - " \
|
||||
"disable, 1 - " \
|
||||
"enable") \
|
||||
release( \
|
||||
uint, \
|
||||
OCL_SET_SVM_SIZE, \
|
||||
4 * 16384, \
|
||||
"set SVM " \
|
||||
"space " \
|
||||
"size for " \
|
||||
"discrete " \
|
||||
"GPU") release(uint, \
|
||||
GPU_WAVES_PER_SIMD, \
|
||||
0, \
|
||||
"Force the number of waves per SIMD (1-10)") \
|
||||
release( \
|
||||
bool, \
|
||||
OCL_STUB_PROGRAMS, \
|
||||
false, \
|
||||
"1 = " \
|
||||
"Enable" \
|
||||
"s OCL " \
|
||||
"progra" \
|
||||
"ms " \
|
||||
"stubin" \
|
||||
"g") \
|
||||
release( \
|
||||
bool, \
|
||||
GPU_ANALYZE_HANG, \
|
||||
false, \
|
||||
"1 " \
|
||||
"= " \
|
||||
"En" \
|
||||
"ab" \
|
||||
"le" \
|
||||
"s " \
|
||||
"GP" \
|
||||
"U " \
|
||||
"ha" \
|
||||
"ng" \
|
||||
" a" \
|
||||
"na" \
|
||||
"ly" \
|
||||
"si" \
|
||||
"s") \
|
||||
release( \
|
||||
uint, \
|
||||
GPU_MAX_REMOTE_MEM_SIZE, \
|
||||
2, \
|
||||
"Maximum size (in Ki) that allows device memory substitution with system") \
|
||||
release(bool, \
|
||||
GPU_ADD_HBCC_SIZE, \
|
||||
false, \
|
||||
"Add HBCC size to the reported device memory") release(bool, \
|
||||
PAL_DISABLE_SDMA, \
|
||||
false, \
|
||||
"1 = Disable SDMA for PAL") release(uint, \
|
||||
PAL_RGP_DISP_COUNT, \
|
||||
10000, \
|
||||
"The number of dispatches for RGP capture with SQTT") release(uint, \
|
||||
PAL_MALL_POLICY, \
|
||||
0, \
|
||||
"Controls the behaviour of allocations with respect to the MALL" \
|
||||
"0 = MALL policy is decided by KMD" \
|
||||
"1 = Allocations are never put through the MALL" \
|
||||
"2 = Allocations will always be put through the MALL") release(bool, \
|
||||
GPU_ENABLE_WAVE32_MODE, \
|
||||
true, \
|
||||
"Enables Wave32 compilation in HW if available") release(bool, \
|
||||
GPU_ENABLE_LC, \
|
||||
true, \
|
||||
"Enables LC path") release(bool, GPU_ENABLE_HW_P2P, \
|
||||
false, \
|
||||
"Enables HW P2P path") release(bool, \
|
||||
GPU_ENABLE_COOP_GROUPS, \
|
||||
true, \
|
||||
"Enables cooperative group launch") release(uint, \
|
||||
GPU_MAX_COMMAND_BUFFERS, \
|
||||
8, \
|
||||
"The maximum number of command buffers allocated per queue") release(uint, \
|
||||
GPU_MAX_HW_QUEUES, \
|
||||
4, \
|
||||
"The maximum number of HW queues allocated per device") release(bool, GPU_IMAGE_BUFFER_WAR, true, \
|
||||
"Enables image buffer workaround") release(cstring, \
|
||||
HIP_VISIBLE_DEVICES, \
|
||||
"", \
|
||||
"Only devices whose index is present in the sequence are visible to HIP") release(cstring, \
|
||||
CUDA_VISIBLE_DEVICES, \
|
||||
"", \
|
||||
"Only devices whose index is present in the sequence are visible to CUDA") \
|
||||
release(bool, \
|
||||
GPU_ENABLE_WGP_MODE, \
|
||||
true, \
|
||||
"Enables WGP Mode in HW if available") \
|
||||
release( \
|
||||
bool, \
|
||||
GPU_DUMP_CODE_OBJECT, \
|
||||
false, \
|
||||
"Enable dump code object") release(uint, \
|
||||
GPU_MAX_USWC_ALLOC_SIZE, 2048, \
|
||||
"Set a limit in Mb on the maximum USWC allocation size" \
|
||||
"-1 = No limit") \
|
||||
release( \
|
||||
uint, \
|
||||
AMD_SERIALIZE_KERNEL, \
|
||||
0, \
|
||||
"Serialize kernel enqueue, 0x1 = Wait for completion before enqueue" \
|
||||
"0x2 = Wait for completion after enqueue 0x3 = both") release(uint, \
|
||||
AMD_SERIALIZE_COPY, \
|
||||
0, \
|
||||
"Serialize copies, 0x1 = Wait for completion before enqueue" \
|
||||
"0x2 = Wait for completion after enqueue 0x3 = both") release(uint, \
|
||||
HIP_LAUNCH_BLOCKING, \
|
||||
0, \
|
||||
"Serialize kernel enqueue 0x1 = Wait for completion after enqueue," \
|
||||
"same as AMD_SERIALIZE_KERNEL=2") release(bool, \
|
||||
PAL_ALWAYS_RESIDENT, \
|
||||
false, \
|
||||
"Force memory resources to become resident at allocation time") release(uint, \
|
||||
HIP_HOST_COHERENT, \
|
||||
0, \
|
||||
"Coherent memory in hipHostMalloc, 0x1 = memory is coherent with host" \
|
||||
"0x0 = memory is not coherent between host and GPU") release(uint, AMD_OPT_FLUSH, 1, \
|
||||
"Kernel flush option , 0x0 = Use system-scope fence operations." \
|
||||
"0x1 = Use device-scope fence operations when possible.") \
|
||||
release( \
|
||||
bool, \
|
||||
AMD_DIRECT_DISPATCH, \
|
||||
false, \
|
||||
"Enable direct kernel dispatch.") release(uint, \
|
||||
HIP_HIDDEN_FREE_MEM, \
|
||||
0, \
|
||||
"Reserve free mem reporting in Mb" \
|
||||
"0 = Disable") release(size_t, \
|
||||
GPU_FORCE_BLIT_COPY_SIZE, \
|
||||
16, \
|
||||
"Use Blit until this size(in KB) for copies") release(uint, \
|
||||
ROC_ACTIVE_WAIT_TIMEOUT, \
|
||||
0, \
|
||||
"Forces active wait of GPU interrup for the timeout(us)") release(bool, \
|
||||
ROC_ENABLE_LARGE_BAR, \
|
||||
true, \
|
||||
"Enable Large Bar if supported by the device") release(bool, \
|
||||
ROC_CPU_WAIT_FOR_SIGNAL, \
|
||||
true, \
|
||||
"Enable CPU wait for dependent HSA signals.") release(bool, \
|
||||
ROC_SYSTEM_SCOPE_SIGNAL, \
|
||||
true, \
|
||||
"Enable system scope for signals (uses interrupts).") release(bool, \
|
||||
GPU_FORCE_QUEUE_PROFILING, \
|
||||
false, \
|
||||
"Force command queue profiling by default") \
|
||||
release( \
|
||||
bool, \
|
||||
HIP_MEM_POOL_SUPPORT, \
|
||||
true, \
|
||||
"Enables memory pool support in HIP") release(bool, \
|
||||
HIP_MEM_POOL_USE_VM, \
|
||||
true, \
|
||||
"Enables memory pool support in HIP") release(bool, \
|
||||
DEBUG_HIP_MEM_POOL_VMHEAP, \
|
||||
true, \
|
||||
"Enables virtual memory for memory pools") release(bool, \
|
||||
PAL_HIP_IPC_FLAG, true, \
|
||||
"Enable interprocess flag for device allocation in PAL HIP") \
|
||||
release( \
|
||||
uint, \
|
||||
PAL_FORCE_ASIC_REVISION, \
|
||||
0, \
|
||||
"Force a specific asic revision for all devices") \
|
||||
release( \
|
||||
bool, \
|
||||
PAL_EMBED_KERNEL_MD, \
|
||||
false, \
|
||||
"Enables writing kernel metadata into command buffers.") release(cstring, \
|
||||
ROC_GLOBAL_CU_MASK, \
|
||||
"", \
|
||||
"Sets a global CU mask (entered as hex value) for all queues," \
|
||||
"Each active bit represents using one CU (e.g., 0xf enables only 4 CUs)") release(size_t, PAL_PREPINNED_MEMORY_SIZE, 64, \
|
||||
"Size in KBytes of prepinned memory") release(bool, \
|
||||
AMD_CPU_AFFINITY, \
|
||||
false, \
|
||||
"Reset CPU affinity of any runtime threads") release(bool, \
|
||||
ROC_USE_FGS_KERNARG, \
|
||||
true, \
|
||||
"Use fine grain kernel args segment for supported asics") release(uint, \
|
||||
ROC_P2P_SDMA_SIZE, \
|
||||
1024, \
|
||||
"The minimum size in KB for P2P transfer with SDMA") release(uint, \
|
||||
ROC_AQL_QUEUE_SIZE, \
|
||||
16384, \
|
||||
"AQL queue size in AQL packets") \
|
||||
release( \
|
||||
uint, \
|
||||
ROC_SIGNAL_POOL_SIZE, \
|
||||
64, \
|
||||
"Initial size of HSA signal pool") \
|
||||
release(uint, \
|
||||
DEBUG_CLR_LIMIT_BLIT_WG, \
|
||||
16, \
|
||||
"Limit the number of workgroups in blit operations") release(bool, \
|
||||
DEBUG_CLR_BLIT_KERNARG_OPT, \
|
||||
false, \
|
||||
"Enable blit kernel arguments optimization") release(bool, \
|
||||
ROC_SKIP_KERNEL_ARG_COPY, \
|
||||
false, \
|
||||
"If true, then runtime can skip kernel arg copy") release(bool, \
|
||||
GPU_STREAMOPS_CP_WAIT, \
|
||||
false, \
|
||||
"Force the stream wait memory operation to wait on CP.") release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, \
|
||||
false, \
|
||||
"Set this to true to force runtime unbundler in hiprtc.") release(size_t, \
|
||||
HIP_INITIAL_DM_SIZE, \
|
||||
8 * Mi, \
|
||||
"Set initial heap size for device malloc.") \
|
||||
release( \
|
||||
bool, \
|
||||
HIP_FORCE_DEV_KERNARG, \
|
||||
true, \
|
||||
"Force device mem for kernel args.") release(bool, \
|
||||
DEBUG_CLR_GRAPH_PACKET_CAPTURE, \
|
||||
true, \
|
||||
"Enable/Disable graph packet capturing") release(bool, \
|
||||
GPU_DEBUG_ENABLE, false, \
|
||||
"Enables collection of extra info for debugger at some perf cost") \
|
||||
release( \
|
||||
cstring, \
|
||||
HIPRTC_COMPILE_OPTIONS_APPEND, \
|
||||
"", \
|
||||
"Set compile options needed for hiprtc compilation") \
|
||||
release( \
|
||||
cstring, \
|
||||
HIPRTC_LINK_OPTIONS_APPEND, \
|
||||
"", \
|
||||
"Set link options needed for hiprtc compilation") \
|
||||
release( \
|
||||
bool, \
|
||||
HIP_VMEM_MANAGE_SUPPORT, \
|
||||
true, \
|
||||
"Virtual Memory Management Support") \
|
||||
release( \
|
||||
bool, \
|
||||
DEBUG_HIP_GRAPH_DOT_PRINT, \
|
||||
false, \
|
||||
"Enable/Disable graph debug dot print dump") release(bool, DEBUG_HIP_FORCE_ASYNC_QUEUE, false, \
|
||||
"Forces grpahs into async queue mode. DEBUG_HIP_FORCE_GRAPH_QUEUES must be 1") \
|
||||
release( \
|
||||
uint, \
|
||||
DEBUG_HIP_FORCE_GRAPH_QUEUES, \
|
||||
4, \
|
||||
"Forces the number of streams for the graph parallel execution") \
|
||||
release( \
|
||||
uint, \
|
||||
DEBUG_HIP_BLOCK_SYNC, \
|
||||
50, \
|
||||
"Blocks synchronization on CPU until the callback processing is done") \
|
||||
release(uint, \
|
||||
DEBUG_CLR_MAX_BATCH_SIZE, \
|
||||
1000, \
|
||||
"Forces the callback to clean-up CPU submission queue") release(bool, DEBUG_CLR_SYSMEM_POOL, false, \
|
||||
"Use sysmem pool implementation in runtime for amd commands") \
|
||||
release(bool, \
|
||||
DEBUG_HIP_KERNARG_COPY_OPT, \
|
||||
true, \
|
||||
"Enable/Disable multiple kern arg copies") release(bool, \
|
||||
DEBUG_CLR_KERNARG_HDP_FLUSH_WA, \
|
||||
false, \
|
||||
"Toggle kernel arg copy workaround") release(bool, \
|
||||
DEBUG_HIP_DYNAMIC_QUEUES, \
|
||||
false, \
|
||||
"Forces dynamic queue management") \
|
||||
release( \
|
||||
uint, \
|
||||
HIP_SKIP_ABORT_ON_GPU_ERROR, \
|
||||
true, \
|
||||
"Set this to true, to avoid host side abort for GPU errors") \
|
||||
release( \
|
||||
bool, \
|
||||
HIP_FORCE_SPIRV_CODEOBJECT, \
|
||||
false, \
|
||||
"Force use of SPIRV instead of device specific code object.") \
|
||||
release( \
|
||||
uint, \
|
||||
DEBUG_CLR_BATCH_CPU_SYNC_SIZE, \
|
||||
8, \
|
||||
"Forces the minimum batch size for CPU sync")
|
||||
// clang-format off
|
||||
#define RUNTIME_FLAGS(debug,release,release_on_stg) \
|
||||
\
|
||||
release(int, AMD_LOG_LEVEL, 0, \
|
||||
"The default log level") \
|
||||
release(uint, AMD_LOG_MASK, 0X7FFFFFFF, \
|
||||
"The mask to enable specific kinds of logs") \
|
||||
release(cstring, AMD_LOG_LEVEL_FILE, "", \
|
||||
"Set output file for AMD_LOG_LEVEL, Default is stderr") \
|
||||
release(size_t, AMD_LOG_LEVEL_SIZE, 2048, \
|
||||
"The max size of AMD_LOG generated in MB if printed to a file") \
|
||||
debug(uint, DEBUG_GPU_FLAGS, 0, \
|
||||
"The debug options for GPU device") \
|
||||
release(size_t, CQ_THREAD_STACK_SIZE, 256*Ki, /* @todo: that much! */ \
|
||||
"The default command queue thread stack size") \
|
||||
release(int, GPU_MAX_WORKGROUP_SIZE, 0, \
|
||||
"Maximum number of workitems in a workgroup for GPU, 0 -use default") \
|
||||
debug(bool, CPU_MEMORY_GUARD_PAGES, false, \
|
||||
"Use guard pages for CPU memory") \
|
||||
debug(size_t, CPU_MEMORY_GUARD_PAGE_SIZE, 64, \
|
||||
"Size in KB of CPU memory guard page") \
|
||||
debug(size_t, CPU_MEMORY_ALIGNMENT_SIZE, 256, \
|
||||
"Size in bytes for the default alignment for guarded memory on CPU") \
|
||||
debug(size_t, PARAMETERS_MIN_ALIGNMENT, NATIVE_ALIGNMENT_SIZE, \
|
||||
"Minimum alignment required for the abstract parameters stack") \
|
||||
debug(size_t, MEMOBJ_BASE_ADDR_ALIGN, 4*Ki, \
|
||||
"Alignment of the base address of any allocate memory object") \
|
||||
release(uint, ROC_HMM_FLAGS, 0, \
|
||||
"ROCm HMM configuration flags") \
|
||||
release(cstring, GPU_DEVICE_ORDINAL, "", \
|
||||
"Select the device ordinal (comma seperated list of available devices)") \
|
||||
release(bool, REMOTE_ALLOC, false, \
|
||||
"Use remote memory for the global heap allocation") \
|
||||
release(uint, GPU_CP_DMA_COPY_SIZE, 1, \
|
||||
"Set maximum size of CP DMA copy in KiB") \
|
||||
release(uint, GPU_MAX_HEAP_SIZE, 100, \
|
||||
"Set maximum size of the GPU heap to % of board memory") \
|
||||
release(uint, GPU_STAGING_BUFFER_SIZE, 4, \
|
||||
"Size of the GPU staging buffer in MiB") \
|
||||
release(bool, GPU_DUMP_BLIT_KERNELS, false, \
|
||||
"Dump the kernels for blit manager") \
|
||||
release(uint, GPU_BLIT_ENGINE_TYPE, 0x0, \
|
||||
"Blit engine type: 0 - Default, 1 - Host, 2 - CAL, 3 - Kernel") \
|
||||
release(bool, GPU_FLUSH_ON_EXECUTION, false, \
|
||||
"Submit commands to HW on every operation. 0 - Disable, 1 - Enable") \
|
||||
release(bool, CL_KHR_FP64, true, \
|
||||
"Enable/Disable support for double precision") \
|
||||
release(cstring, AMD_OCL_BUILD_OPTIONS, 0, \
|
||||
"Set clBuildProgram() and clCompileProgram()'s options (override)") \
|
||||
release(cstring, AMD_OCL_BUILD_OPTIONS_APPEND, 0, \
|
||||
"Append clBuildProgram() and clCompileProgram()'s options") \
|
||||
release(cstring, AMD_OCL_LINK_OPTIONS, 0, \
|
||||
"Set clLinkProgram()'s options (override)") \
|
||||
release(cstring, AMD_OCL_LINK_OPTIONS_APPEND, 0, \
|
||||
"Append clLinkProgram()'s options") \
|
||||
debug(cstring, AMD_OCL_SUBST_OBJFILE, 0, \
|
||||
"Specify binary substitution config file for OpenCL") \
|
||||
release(size_t, GPU_PINNED_XFER_SIZE, 32, \
|
||||
"The pinned buffer size for pinning in read/write transfers in MiB") \
|
||||
release(size_t, GPU_PINNED_MIN_XFER_SIZE, 128, \
|
||||
"The minimal buffer size for pinned read/write transfers in MiB") \
|
||||
release(size_t, GPU_RESOURCE_CACHE_SIZE, 64, \
|
||||
"The resource cache size in MB") \
|
||||
release(size_t, GPU_MAX_SUBALLOC_SIZE, 4096, \
|
||||
"The maximum size accepted for suballocations in KB") \
|
||||
release(size_t, GPU_NUM_MEM_DEPENDENCY, 256, \
|
||||
"Number of memory objects for dependency tracking") \
|
||||
release(size_t, GPU_XFER_BUFFER_SIZE, 0, \
|
||||
"Transfer buffer size for image copy optimization in KB") \
|
||||
release(bool, GPU_IMAGE_DMA, true, \
|
||||
"Enable DRM DMA for image transfers") \
|
||||
release(uint, GPU_SINGLE_ALLOC_PERCENT, 100, \
|
||||
"Maximum size of a single allocation as percentage of total") \
|
||||
release(uint, GPU_NUM_COMPUTE_RINGS, 2, \
|
||||
"GPU number of compute rings. 0 - disabled, 1 , 2,.. - the number of compute rings") \
|
||||
release(bool, AMD_OCL_WAIT_COMMAND, false, \
|
||||
"1 = Enable a wait for every submitted command") \
|
||||
release(uint, GPU_PRINT_CHILD_KERNEL, 0, \
|
||||
"Prints the specified number of the child kernels") \
|
||||
release(bool, GPU_USE_DEVICE_QUEUE, false, \
|
||||
"Use a dedicated device queue for the actual submissions") \
|
||||
release(bool, AMD_THREAD_TRACE_ENABLE, true, \
|
||||
"Enable thread trace extension") \
|
||||
release(uint, OPENCL_VERSION, 200, \
|
||||
"Force GPU opencl version") \
|
||||
release(bool, HSA_LOCAL_MEMORY_ENABLE, true, \
|
||||
"Enable HSA device local memory usage") \
|
||||
release(uint, HSA_KERNARG_POOL_SIZE, 1024 * 1024, \
|
||||
"Kernarg pool size") \
|
||||
release(bool, GPU_MIPMAP, true, \
|
||||
"Enables GPU mipmap extension") \
|
||||
release(uint, GPU_ENABLE_PAL, 2, \
|
||||
"Enables PAL backend. 0 - ROC, 1 - PAL, 2 - ROC or PAL") \
|
||||
release(bool, DISABLE_DEFERRED_ALLOC, false, \
|
||||
"Disables deferred memory allocation on device") \
|
||||
release(int, AMD_GPU_FORCE_SINGLE_FP_DENORM, -1, \
|
||||
"Force denorm for single precision: -1 - don't force, 0 - disable, 1 - enable") \
|
||||
release(uint, OCL_SET_SVM_SIZE, 4*16384, \
|
||||
"set SVM space size for discrete GPU") \
|
||||
release(uint, GPU_WAVES_PER_SIMD, 0, \
|
||||
"Force the number of waves per SIMD (1-10)") \
|
||||
release(bool, OCL_STUB_PROGRAMS, false, \
|
||||
"1 = Enables OCL programs stubing") \
|
||||
release(bool, GPU_ANALYZE_HANG, false, \
|
||||
"1 = Enables GPU hang analysis") \
|
||||
release(uint, GPU_MAX_REMOTE_MEM_SIZE, 2, \
|
||||
"Maximum size (in Ki) that allows device memory substitution with system") \
|
||||
release(bool, GPU_ADD_HBCC_SIZE, false, \
|
||||
"Add HBCC size to the reported device memory") \
|
||||
release(bool, PAL_DISABLE_SDMA, false, \
|
||||
"1 = Disable SDMA for PAL") \
|
||||
release(uint, PAL_RGP_DISP_COUNT, 10000, \
|
||||
"The number of dispatches for RGP capture with SQTT") \
|
||||
release(uint, PAL_MALL_POLICY, 0, \
|
||||
"Controls the behaviour of allocations with respect to the MALL" \
|
||||
"0 = MALL policy is decided by KMD" \
|
||||
"1 = Allocations are never put through the MALL" \
|
||||
"2 = Allocations will always be put through the MALL") \
|
||||
release(bool, GPU_ENABLE_WAVE32_MODE, true, \
|
||||
"Enables Wave32 compilation in HW if available") \
|
||||
release(bool, GPU_ENABLE_LC, true, \
|
||||
"Enables LC path") \
|
||||
release(bool, GPU_ENABLE_HW_P2P, false, \
|
||||
"Enables HW P2P path") \
|
||||
release(bool, GPU_ENABLE_COOP_GROUPS, true, \
|
||||
"Enables cooperative group launch") \
|
||||
release(uint, GPU_MAX_COMMAND_BUFFERS, 8, \
|
||||
"The maximum number of command buffers allocated per queue") \
|
||||
release(uint, GPU_MAX_HW_QUEUES, 4, \
|
||||
"The maximum number of HW queues allocated per device") \
|
||||
release(bool, GPU_IMAGE_BUFFER_WAR, true, \
|
||||
"Enables image buffer workaround") \
|
||||
release(cstring, HIP_VISIBLE_DEVICES, "", \
|
||||
"Only devices whose index is present in the sequence are visible to HIP") \
|
||||
release(cstring, CUDA_VISIBLE_DEVICES, "", \
|
||||
"Only devices whose index is present in the sequence are visible to CUDA") \
|
||||
release(bool, GPU_ENABLE_WGP_MODE, true, \
|
||||
"Enables WGP Mode in HW if available") \
|
||||
release(bool, GPU_DUMP_CODE_OBJECT, false, \
|
||||
"Enable dump code object") \
|
||||
release(uint, GPU_MAX_USWC_ALLOC_SIZE, 2048, \
|
||||
"Set a limit in Mb on the maximum USWC allocation size" \
|
||||
"-1 = No limit") \
|
||||
release(uint, AMD_SERIALIZE_KERNEL, 0, \
|
||||
"Serialize kernel enqueue, 0x1 = Wait for completion before enqueue" \
|
||||
"0x2 = Wait for completion after enqueue 0x3 = both") \
|
||||
release(uint, AMD_SERIALIZE_COPY, 0, \
|
||||
"Serialize copies, 0x1 = Wait for completion before enqueue" \
|
||||
"0x2 = Wait for completion after enqueue 0x3 = both") \
|
||||
release(uint, HIP_LAUNCH_BLOCKING, 0, \
|
||||
"Serialize kernel enqueue 0x1 = Wait for completion after enqueue," \
|
||||
"same as AMD_SERIALIZE_KERNEL=2") \
|
||||
release(bool, PAL_ALWAYS_RESIDENT, false, \
|
||||
"Force memory resources to become resident at allocation time") \
|
||||
release(uint, HIP_HOST_COHERENT, 0, \
|
||||
"Coherent memory in hipHostMalloc, 0x1 = memory is coherent with host"\
|
||||
"0x0 = memory is not coherent between host and GPU") \
|
||||
release(uint, AMD_OPT_FLUSH, 1, \
|
||||
"Kernel flush option , 0x0 = Use system-scope fence operations." \
|
||||
"0x1 = Use device-scope fence operations when possible.") \
|
||||
release(bool, AMD_DIRECT_DISPATCH, false, \
|
||||
"Enable direct kernel dispatch.") \
|
||||
release(uint, HIP_HIDDEN_FREE_MEM, 0, \
|
||||
"Reserve free mem reporting in Mb" \
|
||||
"0 = Disable") \
|
||||
release(size_t, GPU_FORCE_BLIT_COPY_SIZE, 16, \
|
||||
"Use Blit until this size(in KB) for copies") \
|
||||
release(uint, ROC_ACTIVE_WAIT_TIMEOUT, 0, \
|
||||
"Forces active wait of GPU interrup for the timeout(us)") \
|
||||
release(bool, ROC_ENABLE_LARGE_BAR, true, \
|
||||
"Enable Large Bar if supported by the device") \
|
||||
release(bool, ROC_CPU_WAIT_FOR_SIGNAL, true, \
|
||||
"Enable CPU wait for dependent HSA signals.") \
|
||||
release(bool, ROC_SYSTEM_SCOPE_SIGNAL, true, \
|
||||
"Enable system scope for signals (uses interrupts).") \
|
||||
release(bool, GPU_FORCE_QUEUE_PROFILING, false, \
|
||||
"Force command queue profiling by default") \
|
||||
release(bool, HIP_MEM_POOL_SUPPORT, true, \
|
||||
"Enables memory pool support in HIP") \
|
||||
release(bool, HIP_MEM_POOL_USE_VM, true, \
|
||||
"Enables memory pool support in HIP") \
|
||||
release(bool, DEBUG_HIP_MEM_POOL_VMHEAP, true, \
|
||||
"Enables virtual memory for memory pools") \
|
||||
release(bool, PAL_HIP_IPC_FLAG, true, \
|
||||
"Enable interprocess flag for device allocation in PAL HIP") \
|
||||
release(uint, PAL_FORCE_ASIC_REVISION, 0, \
|
||||
"Force a specific asic revision for all devices") \
|
||||
release(bool, PAL_EMBED_KERNEL_MD, false, \
|
||||
"Enables writing kernel metadata into command buffers.") \
|
||||
release(cstring, ROC_GLOBAL_CU_MASK, "", \
|
||||
"Sets a global CU mask (entered as hex value) for all queues," \
|
||||
"Each active bit represents using one CU (e.g., 0xf enables only 4 CUs)") \
|
||||
release(size_t, PAL_PREPINNED_MEMORY_SIZE, 64, \
|
||||
"Size in KBytes of prepinned memory") \
|
||||
release(bool, AMD_CPU_AFFINITY, false, \
|
||||
"Reset CPU affinity of any runtime threads") \
|
||||
release(bool, ROC_USE_FGS_KERNARG, true, \
|
||||
"Use fine grain kernel args segment for supported asics") \
|
||||
release(uint, ROC_P2P_SDMA_SIZE, 1024, \
|
||||
"The minimum size in KB for P2P transfer with SDMA") \
|
||||
release(uint, ROC_AQL_QUEUE_SIZE, 16384, \
|
||||
"AQL queue size in AQL packets") \
|
||||
release(uint, ROC_SIGNAL_POOL_SIZE, 64, \
|
||||
"Initial size of HSA signal pool") \
|
||||
release(uint, DEBUG_CLR_LIMIT_BLIT_WG, 16, \
|
||||
"Limit the number of workgroups in blit operations") \
|
||||
release(bool, DEBUG_CLR_BLIT_KERNARG_OPT, false, \
|
||||
"Enable blit kernel arguments optimization") \
|
||||
release(bool, ROC_SKIP_KERNEL_ARG_COPY, false, \
|
||||
"If true, then runtime can skip kernel arg copy") \
|
||||
release(bool, GPU_STREAMOPS_CP_WAIT, false, \
|
||||
"Force the stream wait memory operation to wait on CP.") \
|
||||
release(bool, HIPRTC_USE_RUNTIME_UNBUNDLER, false, \
|
||||
"Set this to true to force runtime unbundler in hiprtc.") \
|
||||
release(size_t, HIP_INITIAL_DM_SIZE, 8 * Mi, \
|
||||
"Set initial heap size for device malloc.") \
|
||||
release(bool, HIP_FORCE_DEV_KERNARG, true, \
|
||||
"Force device mem for kernel args.") \
|
||||
release(bool, DEBUG_CLR_GRAPH_PACKET_CAPTURE, true, \
|
||||
"Enable/Disable graph packet capturing") \
|
||||
release(bool, GPU_DEBUG_ENABLE, false, \
|
||||
"Enables collection of extra info for debugger at some perf cost") \
|
||||
release(cstring, HIPRTC_COMPILE_OPTIONS_APPEND, "", \
|
||||
"Set compile options needed for hiprtc compilation") \
|
||||
release(cstring, HIPRTC_LINK_OPTIONS_APPEND, "", \
|
||||
"Set link options needed for hiprtc compilation") \
|
||||
release(bool, HIP_VMEM_MANAGE_SUPPORT, true, \
|
||||
"Virtual Memory Management Support") \
|
||||
release(bool, DEBUG_HIP_GRAPH_DOT_PRINT, false, \
|
||||
"Enable/Disable graph debug dot print dump") \
|
||||
release(bool, DEBUG_HIP_FORCE_ASYNC_QUEUE, false, \
|
||||
"Forces grpahs into async queue mode. DEBUG_HIP_FORCE_GRAPH_QUEUES must be 1") \
|
||||
release(uint, DEBUG_HIP_FORCE_GRAPH_QUEUES, 4, \
|
||||
"Forces the number of streams for the graph parallel execution") \
|
||||
release(uint, DEBUG_HIP_BLOCK_SYNC, 50, \
|
||||
"Blocks synchronization on CPU until the callback processing is done")\
|
||||
release(uint, DEBUG_CLR_MAX_BATCH_SIZE, 1000, \
|
||||
"Forces the callback to clean-up CPU submission queue") \
|
||||
release(bool, DEBUG_CLR_SYSMEM_POOL, false, \
|
||||
"Use sysmem pool implementation in runtime for amd commands") \
|
||||
release(bool, DEBUG_HIP_KERNARG_COPY_OPT, true, \
|
||||
"Enable/Disable multiple kern arg copies") \
|
||||
release(bool, DEBUG_CLR_KERNARG_HDP_FLUSH_WA, false, \
|
||||
"Toggle kernel arg copy workaround") \
|
||||
release(bool, DEBUG_HIP_DYNAMIC_QUEUES, false, \
|
||||
"Forces dynamic queue management") \
|
||||
release(uint, HIP_SKIP_ABORT_ON_GPU_ERROR, true, \
|
||||
"Set this to true, to avoid host side abort for GPU errors") \
|
||||
release(bool, HIP_FORCE_SPIRV_CODEOBJECT, false, \
|
||||
"Force use of SPIRV instead of device specific code object.") \
|
||||
release(uint, DEBUG_CLR_BATCH_CPU_SYNC_SIZE, 8, \
|
||||
"Forces the minimum batch size for CPU sync") // clang-format on
|
||||
|
||||
namespace amd {
|
||||
|
||||
|
||||
@@ -256,7 +256,7 @@ inline float half2float(const uint16_t Val) {
|
||||
uint32_t signBit = ((uint32_t)(Val & 0x8000)) << signBitShift;
|
||||
uint32_t exponent = (Val & halfExpoentMask) >> 10;
|
||||
uint32_t fraction = ((uint32_t)(Val & halfFractionMask))
|
||||
<< 13; // Aligning half fraction to float
|
||||
<< 13; // Aligning half fraction to float
|
||||
union {
|
||||
uint32_t u32Arg;
|
||||
float fArg;
|
||||
@@ -283,7 +283,7 @@ inline float half2float(const uint16_t Val) {
|
||||
}
|
||||
}
|
||||
uint32_t floatExponent = ((exponent + floatExponentBias - halfExponentBias) & 0xff)
|
||||
<< floatExponentShift;
|
||||
<< floatExponentShift;
|
||||
u32Arg = signBit | floatExponent | fraction;
|
||||
return fArg;
|
||||
}
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
Language: Cpp
|
||||
BasedOnStyle: Google
|
||||
AlignEscapedNewlinesLeft: false
|
||||
AlignOperands: false
|
||||
AlignOperands: Align
|
||||
ColumnLimit: 100
|
||||
AlwaysBreakTemplateDeclarations: false
|
||||
BreakTemplateDeclarations: No
|
||||
DerivePointerAlignment: false
|
||||
IndentFunctionDeclarationAfterType: false
|
||||
MaxEmptyLinesToKeep: 2
|
||||
SortIncludes: false
|
||||
SortIncludes: Never
|
||||
|
||||
変更されたファイルが多すぎるため、一部のファイルは表示されません さらに表示
新しいイシューから参照
ユーザーをブロックする