diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h b/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h index e28c2cbb3c..b748835fa9 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_device_functions.h @@ -640,7 +640,7 @@ __device__ inline void __named_sync() { __builtin_amdgcn_s_barrier(); } // hip.amdgcn.bc - lanemask __device__ inline __hip_uint64_t __lanemask_gt() { - __hip_uint32_t lane = __ockl_lane_u32(); + __hip_uint32_t lane = __lane_id(); if (lane == 63) return 0; __hip_uint64_t ballot = __ballot64(1); __hip_uint64_t mask = (~((__hip_uint64_t)0)) << (lane + 1); @@ -648,14 +648,14 @@ __device__ inline __hip_uint64_t __lanemask_gt() { } __device__ inline __hip_uint64_t __lanemask_lt() { - __hip_uint32_t lane = __ockl_lane_u32(); + __hip_uint32_t lane = __lane_id(); __hip_int64_t ballot = __ballot64(1); __hip_uint64_t mask = ((__hip_uint64_t)1 << lane) - (__hip_uint64_t)1; return mask & ballot; } __device__ inline __hip_uint64_t __lanemask_eq() { - __hip_uint32_t lane = __ockl_lane_u32(); + __hip_uint32_t lane = __lane_id(); __hip_int64_t mask = ((__hip_uint64_t)1 << lane); return mask; } diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index aaad1827e2..0c05050af1 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -357,7 +357,7 @@ __device__ inline T __reduce_op_sync(MaskT mask, T val, BinaryOp op, WfReduce wf #endif firstLane = __builtin_ctzll(mask); - laneId = __ockl_lane_u32(); + laneId = __lane_id(); nextBit = laneId; // the number of iterations needs to be at least log2(number of bits on) numIterations = sizeof(int) * 8 - __clz(maskNumBits); diff --git a/projects/hip-tests/catch/performance/warpSync/warpSync.cc b/projects/hip-tests/catch/performance/warpSync/warpSync.cc index e6a16325c2..59321adea3 100644 --- a/projects/hip-tests/catch/performance/warpSync/warpSync.cc +++ b/projects/hip-tests/catch/performance/warpSync/warpSync.cc @@ -91,11 +91,13 @@ __global__ void reduceAllAtomics(T* __restrict__ output, const T* __restrict__ i __syncthreads(); - if (mask & (1ul << __ockl_lane_u32())) op(&result[numWarp], input[idx]); + uint lane = __lane_id(); + + if (mask & (1ul << lane)) op(&result[numWarp], input[idx]); __syncthreads(); - if (__ockl_lane_u32() == 0) output[idx / warpSize] = result[numWarp]; + if (lane == 0) output[idx / warpSize] = result[numWarp]; } template class Op> @@ -104,7 +106,7 @@ __global__ void reduceOpSync(T* __restrict__ output, const T* __restrict__ input int idx = threadIdx.x + blockIdx.x * kBlockDim; T result; - if (mask & (1ul << __ockl_lane_u32())) { + if (mask & (1ul << __lane_id())) { if constexpr (std::is_same, std::plus>::value) result = __reduce_add_sync(mask, input[idx]); else if constexpr (std::is_same, MinOp>::value)