SWDEV-548892 - Stop using __ockl_lane_u32 (#2186)
__lane_id() already exists and is identical to __ockl_lane_u32(), so every call site below switches to __lane_id().
这个提交包含在:
@@ -640,7 +640,7 @@ __device__ inline void __named_sync() { __builtin_amdgcn_s_barrier(); }
|
||||
|
||||
// hip.amdgcn.bc - lanemask
|
||||
__device__ inline __hip_uint64_t __lanemask_gt() {
|
||||
__hip_uint32_t lane = __ockl_lane_u32();
|
||||
__hip_uint32_t lane = __lane_id();
|
||||
if (lane == 63) return 0;
|
||||
__hip_uint64_t ballot = __ballot64(1);
|
||||
__hip_uint64_t mask = (~((__hip_uint64_t)0)) << (lane + 1);
|
||||
@@ -648,14 +648,14 @@ __device__ inline __hip_uint64_t __lanemask_gt() {
|
||||
}
|
||||
|
||||
__device__ inline __hip_uint64_t __lanemask_lt() {
|
||||
__hip_uint32_t lane = __ockl_lane_u32();
|
||||
__hip_uint32_t lane = __lane_id();
|
||||
__hip_int64_t ballot = __ballot64(1);
|
||||
__hip_uint64_t mask = ((__hip_uint64_t)1 << lane) - (__hip_uint64_t)1;
|
||||
return mask & ballot;
|
||||
}
|
||||
|
||||
__device__ inline __hip_uint64_t __lanemask_eq() {
|
||||
__hip_uint32_t lane = __ockl_lane_u32();
|
||||
__hip_uint32_t lane = __lane_id();
|
||||
__hip_int64_t mask = ((__hip_uint64_t)1 << lane);
|
||||
return mask;
|
||||
}
|
||||
|
||||
@@ -357,7 +357,7 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf
|
||||
#endif
|
||||
|
||||
firstLane = __builtin_ctzll(mask);
|
||||
laneId = __ockl_lane_u32();
|
||||
laneId = __lane_id();
|
||||
nextBit = laneId;
|
||||
// the number of iterations needs to be at least log2(number of bits on)
|
||||
numIterations = sizeof(int) * 8 - __clz(maskNumBits);
|
||||
|
||||
@@ -91,11 +91,13 @@ __global__ void reduceAllAtomics(T* __restrict__ output, const T* __restrict__ i
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (mask & (1ul << __ockl_lane_u32())) op(&result[numWarp], input[idx]);
|
||||
uint lane = __lane_id();
|
||||
|
||||
if (mask & (1ul << lane)) op(&result[numWarp], input[idx]);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (__ockl_lane_u32() == 0) output[idx / warpSize] = result[numWarp];
|
||||
if (lane == 0) output[idx / warpSize] = result[numWarp];
|
||||
}
|
||||
|
||||
template <class T, template <typename> class Op>
|
||||
@@ -104,7 +106,7 @@ __global__ void reduceOpSync(T* __restrict__ output, const T* __restrict__ input
|
||||
int idx = threadIdx.x + blockIdx.x * kBlockDim;
|
||||
T result;
|
||||
|
||||
if (mask & (1ul << __ockl_lane_u32())) {
|
||||
if (mask & (1ul << __lane_id())) {
|
||||
if constexpr (std::is_same<Op<T>, std::plus<T>>::value)
|
||||
result = __reduce_add_sync(mask, input[idx]);
|
||||
else if constexpr (std::is_same<Op<T>, MinOp<T>>::value)
|
||||
|
||||
在新工单中引用
屏蔽一个用户