Move hipclang funcs into corresponding headers
이 커밋은 다음에 포함됨:
@@ -24,8 +24,12 @@ THE SOFTWARE.
|
||||
#define HIP_INCLUDE_HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
|
||||
|
||||
#include "host_defines.h"
|
||||
#include "math_fwd.h"
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hip_vector_types.h>
|
||||
#include <hip/hcc_detail/device_library_decls.h>
|
||||
#include <hip/hcc_detail/llvm_intrinsics.h>
|
||||
|
||||
extern "C" __device__ unsigned int __hip_hc_ir_umul24_int(unsigned int, unsigned int);
|
||||
extern "C" __device__ signed int __hip_hc_ir_mul24_int(signed int, signed int);
|
||||
@@ -209,5 +213,257 @@ __device__ char4 __hip_hc_add8pk(char4, char4);
|
||||
__device__ char4 __hip_hc_sub8pk(char4, char4);
|
||||
__device__ char4 __hip_hc_mul8pk(char4, char4);
|
||||
|
||||
#if defined(__HCC__)
|
||||
#define __HCC_OR_HIP_CLANG__ 1
|
||||
#elif defined(__clang__) && defined(__HIP__)
|
||||
#define __HCC_OR_HIP_CLANG__ 1
|
||||
#else
|
||||
#define __HCC_OR_HIP_CLANG__ 0
|
||||
#endif
|
||||
|
||||
#ifdef __HCC_OR_HIP_CLANG__
|
||||
|
||||
#ifdef __HIP_DEVICE_COMPILE__
|
||||
|
||||
// Clock functions
|
||||
__device__
|
||||
inline
|
||||
long long int __clock64() { return (long long int) __builtin_amdgcn_s_memrealtime(); }
|
||||
|
||||
__device__
|
||||
inline
|
||||
long long int __clock() { return (long long int) __builtin_amdgcn_s_memrealtime(); }
|
||||
|
||||
// hip.amdgcn.bc - named sync
|
||||
__device__
|
||||
inline
|
||||
void __named_sync(int a, int b) { __builtin_amdgcn_s_barrier(); }
|
||||
|
||||
#endif // __HIP_DEVICE_COMPILE__
|
||||
|
||||
// warp vote function __all __any __ballot
|
||||
__device__
|
||||
int __all(int input);
|
||||
__device__
|
||||
int __any(int input);
|
||||
__device__
|
||||
unsigned long long int __ballot(int input);
|
||||
|
||||
__device__
|
||||
inline
|
||||
uint64_t __ballot64(int a) {
|
||||
int64_t s;
|
||||
// define i64 @__ballot64(i32 %a) #0 {
|
||||
// %b = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 %a) #1
|
||||
// ret i64 %b
|
||||
// }
|
||||
__asm("v_cmp_ne_i32_e64 $0, 0, $1" : "=s"(s) : "v"(a));
|
||||
return s;
|
||||
}
|
||||
|
||||
// hip.amdgcn.bc - lanemask
|
||||
__device__
|
||||
inline
|
||||
int64_t __lanemask_gt()
|
||||
{
|
||||
int32_t activelane = __ockl_activelane_u32();
|
||||
int64_t ballot = __ballot64(1);
|
||||
if (activelane != 63) {
|
||||
int64_t tmp = (~0UL) << (activelane + 1);
|
||||
return tmp & ballot;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int64_t __lanemask_lt()
|
||||
{
|
||||
int32_t activelane = __ockl_activelane_u32();
|
||||
int64_t ballot = __ballot64(1);
|
||||
if (activelane == 0)
|
||||
return 0;
|
||||
return ballot;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
void* __get_dynamicgroupbaseptr()
|
||||
{
|
||||
// Get group segment base pointer.
|
||||
return (char*)__local_to_generic(__to_local(__llvm_amdgcn_groupstaticsize()));
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
void *__amdgcn_get_dynamicgroupbaseptr() {
|
||||
return __get_dynamicgroupbaseptr();
|
||||
}
|
||||
|
||||
#endif // __HCC_OR_HIP_CLANG__
|
||||
|
||||
#ifdef __HCC__
|
||||
|
||||
/**
|
||||
* extern __shared__
|
||||
*/
|
||||
|
||||
// Macro to replace extern __shared__ declarations
|
||||
// to local variable definitions
|
||||
#define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();
|
||||
|
||||
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
|
||||
|
||||
|
||||
#elif defined(__clang__) && defined(__HIP__)
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
|
||||
__attribute__((weak))
|
||||
|
||||
__DEVICE__
|
||||
inline
|
||||
void __assert_fail(const char * __assertion,
|
||||
const char *__file,
|
||||
unsigned int __line,
|
||||
const char *__function)
|
||||
{
|
||||
// Ignore all the args for now.
|
||||
__builtin_trap();
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline
|
||||
void __assertfail(const char * __assertion,
|
||||
const char *__file,
|
||||
unsigned int __line,
|
||||
const char *__function,
|
||||
size_t charsize)
|
||||
{
|
||||
// ignore all the args for now.
|
||||
__builtin_trap();
|
||||
}
|
||||
|
||||
// hip.amdgcn.bc - sync threads
|
||||
#define __CLK_LOCAL_MEM_FENCE 0x01
|
||||
typedef unsigned __cl_mem_fence_flags;
|
||||
|
||||
typedef enum __memory_scope {
|
||||
__memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
|
||||
__memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
|
||||
__memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
|
||||
__memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
|
||||
__memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
|
||||
} __memory_scope;
|
||||
|
||||
// enum values aligned with what clang uses in EmitAtomicExpr()
|
||||
typedef enum __memory_order
|
||||
{
|
||||
__memory_order_relaxed = __ATOMIC_RELAXED,
|
||||
__memory_order_acquire = __ATOMIC_ACQUIRE,
|
||||
__memory_order_release = __ATOMIC_RELEASE,
|
||||
__memory_order_acq_rel = __ATOMIC_ACQ_REL,
|
||||
__memory_order_seq_cst = __ATOMIC_SEQ_CST
|
||||
} __memory_order;
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
|
||||
{
|
||||
if (flags) {
|
||||
switch(scope) {
|
||||
case __memory_scope_work_item: break;
|
||||
case __memory_scope_sub_group: __llvm_fence_rel_sg(); break;
|
||||
case __memory_scope_work_group: __llvm_fence_rel_wg(); break;
|
||||
case __memory_scope_device: __llvm_fence_rel_dev(); break;
|
||||
case __memory_scope_all_svm_devices: __llvm_fence_rel_sys(); break;
|
||||
}
|
||||
//atomic_work_item_fence(flags, memory_order_release, scope);
|
||||
__builtin_amdgcn_s_barrier();
|
||||
//atomic_work_item_fence(flags, memory_order_acquire, scope);
|
||||
switch(scope) {
|
||||
case __memory_scope_work_item: break;
|
||||
case __memory_scope_sub_group: __llvm_fence_acq_sg(); break;
|
||||
case __memory_scope_work_group: __llvm_fence_acq_wg(); break;
|
||||
case __memory_scope_device: __llvm_fence_acq_dev(); break;
|
||||
case __memory_scope_all_svm_devices: __llvm_fence_acq_sys(); break;
|
||||
}
|
||||
} else {
|
||||
__builtin_amdgcn_s_barrier();
|
||||
}
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void __barrier(int n)
|
||||
{
|
||||
__work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
__attribute__((noduplicate))
|
||||
void __syncthreads()
|
||||
{
|
||||
__barrier(__CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// hip.amdgcn.bc - device routine
|
||||
/*
|
||||
HW_ID Register bit structure
|
||||
WAVE_ID 3:0 Wave buffer slot number. 0-9.
|
||||
SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
|
||||
PIPE_ID 7:6 Pipeline from which the wave was dispatched.
|
||||
CU_ID 11:8 Compute Unit the wave is assigned to.
|
||||
SH_ID 12 Shader Array (within an SE) the wave is assigned to.
|
||||
SE_ID 14:13 Shader Engine the wave is assigned to.
|
||||
TG_ID 19:16 Thread-group ID
|
||||
VM_ID 23:20 Virtual Memory ID
|
||||
QUEUE_ID 26:24 Queue from which this wave was dispatched.
|
||||
STATE_ID 29:27 State ID (graphics only, not compute).
|
||||
ME_ID 31:30 Micro-engine ID.
|
||||
*/
|
||||
|
||||
#define HW_ID 4
|
||||
|
||||
#define HW_ID_CU_ID_SIZE 4
|
||||
#define HW_ID_CU_ID_OFFSET 8
|
||||
|
||||
#define HW_ID_SE_ID_SIZE 2
|
||||
#define HW_ID_SE_ID_OFFSET 13
|
||||
|
||||
/*
|
||||
Encoding of parameter bitmask
|
||||
HW_ID 5:0 HW_ID
|
||||
OFFSET 10:6 Range: 0..31
|
||||
SIZE 15:11 Range: 1..32
|
||||
*/
|
||||
|
||||
#define GETREG_IMMED(SZ,OFF,REG) (SZ << 11) | (OFF << 6) | REG
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned __smid(void)
|
||||
{
|
||||
unsigned cu_id = __builtin_amdgcn_s_getreg(
|
||||
GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
|
||||
unsigned se_id = __builtin_amdgcn_s_getreg(
|
||||
GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
|
||||
|
||||
/* Each shader engine has 16 CU */
|
||||
return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
|
||||
}
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
|
||||
// Macro to replace extern __shared__ declarations
|
||||
// to local variable definitions
|
||||
#define HIP_DYNAMIC_SHARED(type, var) \
|
||||
type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
|
||||
|
||||
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
|
||||
|
||||
|
||||
#endif //defined(__clang__) && defined(__HIP__)
|
||||
|
||||
#endif
|
||||
|
||||
@@ -62,8 +62,6 @@ THE SOFTWARE.
|
||||
#define CUDA_SUCCESS hipSuccess
|
||||
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hcc_detail/device_library_decls.h>
|
||||
#include <hip/hcc_detail/llvm_intrinsics.h>
|
||||
#endif // __HCC_OR_HIP_CLANG__
|
||||
|
||||
#if __HCC__
|
||||
@@ -190,60 +188,9 @@ extern int HIP_TRACE_API;
|
||||
//#define warpSize hc::__wavesize()
|
||||
static constexpr int warpSize = 64;
|
||||
|
||||
#define clock_t long long int
|
||||
__device__
|
||||
inline
|
||||
long long int __clock64() { return (long long int)__llvm_amdgcn_s_memrealtime(); }
|
||||
|
||||
__device__
|
||||
inline
|
||||
clock_t __clock() { return (clock_t)__llvm_amdgcn_s_memrealtime(); }
|
||||
|
||||
// abort
|
||||
__device__ void abort();
|
||||
|
||||
// warp vote function __all __any __ballot
|
||||
__device__ int __all(int input);
|
||||
__device__ int __any(int input);
|
||||
__device__ unsigned long long int __ballot(int input);
|
||||
|
||||
__device__
|
||||
inline
|
||||
int64_t __ballot64(int a) {
|
||||
int64_t s;
|
||||
// define i64 @__ballot64(i32 %a) #0 {
|
||||
// %b = tail call i64 asm "v_cmp_ne_i32_e64 $0, 0, $1", "=s,v"(i32 %a) #1
|
||||
// ret i64 %b
|
||||
// }
|
||||
__asm("v_cmp_ne_i32_e64 $0, 0, $1" : "=s"(s) : "v"(a));
|
||||
return s;
|
||||
}
|
||||
|
||||
// hip.amdgcn.bc - lanemask
|
||||
__device__
|
||||
inline
|
||||
int64_t __lanemask_gt()
|
||||
{
|
||||
int32_t activelane = __ockl_activelane_u32();
|
||||
int64_t ballot = __ballot64(1);
|
||||
if (activelane != 63) {
|
||||
int64_t tmp = (~0UL) << (activelane + 1);
|
||||
return tmp & ballot;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
int64_t __lanemask_lt()
|
||||
{
|
||||
int32_t activelane = __ockl_activelane_u32();
|
||||
int64_t ballot = __ballot64(1);
|
||||
if (activelane == 0)
|
||||
return 0;
|
||||
return ballot;
|
||||
}
|
||||
|
||||
#if __HIP_ARCH_GFX701__ == 0
|
||||
|
||||
// warp shuffle functions
|
||||
@@ -283,20 +230,6 @@ __host__ __device__ int min(int arg1, int arg2);
|
||||
__host__ __device__ int max(int arg1, int arg2);
|
||||
|
||||
|
||||
__device__
|
||||
inline
|
||||
void* __get_dynamicgroupbaseptr()
|
||||
{
|
||||
// Get group segment base pointer.
|
||||
return (char*)__local_to_generic(__to_local(__llvm_amdgcn_groupstaticsize()));
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
void *__amdgcn_get_dynamicgroupbaseptr() {
|
||||
return __get_dynamicgroupbaseptr();
|
||||
}
|
||||
|
||||
/**
|
||||
* CUDA 8 device function features
|
||||
|
||||
@@ -371,9 +304,6 @@ __device__ void __threadfence_system(void);
|
||||
* @}
|
||||
*/
|
||||
|
||||
// hip.amdgcn.bc - named sync
|
||||
__device__ inline void __named_sync(int a, int b) { __llvm_amdgcn_s_barrier(); }
|
||||
|
||||
#endif // __HCC_OR_HIP_CLANG__
|
||||
|
||||
#if defined __HCC__
|
||||
@@ -496,17 +426,6 @@ extern void ihipPostLaunchKernel(const char* kernelName, hipStream_t stream, gri
|
||||
|
||||
#endif //__HCC_CPP__
|
||||
|
||||
/**
|
||||
* extern __shared__
|
||||
*/
|
||||
|
||||
// Macro to replace extern __shared__ declarations
|
||||
// to local variable definitions
|
||||
#define HIP_DYNAMIC_SHARED(type, var) type* var = (type*)__get_dynamicgroupbaseptr();
|
||||
|
||||
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
|
||||
|
||||
|
||||
/**
|
||||
* @defgroup HIP-ENV HIP Environment Variables
|
||||
* @{
|
||||
@@ -625,154 +544,6 @@ extern const __device__ __attribute__((weak)) __hip_builtin_gridDim_t gridDim;
|
||||
#define hipGridDim_y gridDim.y
|
||||
#define hipGridDim_z gridDim.z
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
#define __DEVICE__ extern "C" __device__ __attribute__((always_inline)) \
|
||||
__attribute__((weak))
|
||||
|
||||
__DEVICE__ void __device_trap() __asm("llvm.trap");
|
||||
|
||||
__DEVICE__
|
||||
inline
|
||||
void __assert_fail(const char * __assertion,
|
||||
const char *__file,
|
||||
unsigned int __line,
|
||||
const char *__function)
|
||||
{
|
||||
// Ignore all the args for now.
|
||||
__device_trap();
|
||||
}
|
||||
|
||||
__DEVICE__
|
||||
inline
|
||||
void __assertfail(const char * __assertion,
|
||||
const char *__file,
|
||||
unsigned int __line,
|
||||
const char *__function,
|
||||
size_t charsize)
|
||||
{
|
||||
// ignore all the args for now.
|
||||
__device_trap();
|
||||
}
|
||||
|
||||
// hip.amdgcn.bc - sync threads
|
||||
#define __CLK_LOCAL_MEM_FENCE 0x01
|
||||
typedef unsigned __cl_mem_fence_flags;
|
||||
|
||||
typedef enum __memory_scope {
|
||||
__memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
|
||||
__memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
|
||||
__memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
|
||||
__memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
|
||||
__memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
|
||||
} __memory_scope;
|
||||
|
||||
// enum values aligned with what clang uses in EmitAtomicExpr()
|
||||
typedef enum __memory_order
|
||||
{
|
||||
__memory_order_relaxed = __ATOMIC_RELAXED,
|
||||
__memory_order_acquire = __ATOMIC_ACQUIRE,
|
||||
__memory_order_release = __ATOMIC_RELEASE,
|
||||
__memory_order_acq_rel = __ATOMIC_ACQ_REL,
|
||||
__memory_order_seq_cst = __ATOMIC_SEQ_CST
|
||||
} __memory_order;
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
|
||||
{
|
||||
if (flags) {
|
||||
switch(scope) {
|
||||
case __memory_scope_work_item: break;
|
||||
case __memory_scope_sub_group: __llvm_fence_rel_sg(); break;
|
||||
case __memory_scope_work_group: __llvm_fence_rel_wg(); break;
|
||||
case __memory_scope_device: __llvm_fence_rel_dev(); break;
|
||||
case __memory_scope_all_svm_devices: __llvm_fence_rel_sys(); break;
|
||||
}
|
||||
//atomic_work_item_fence(flags, memory_order_release, scope);
|
||||
__builtin_amdgcn_s_barrier();
|
||||
//atomic_work_item_fence(flags, memory_order_acquire, scope);
|
||||
switch(scope) {
|
||||
case __memory_scope_work_item: break;
|
||||
case __memory_scope_sub_group: __llvm_fence_acq_sg(); break;
|
||||
case __memory_scope_work_group: __llvm_fence_acq_wg(); break;
|
||||
case __memory_scope_device: __llvm_fence_acq_dev(); break;
|
||||
case __memory_scope_all_svm_devices: __llvm_fence_acq_sys(); break;
|
||||
}
|
||||
} else {
|
||||
__builtin_amdgcn_s_barrier();
|
||||
}
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
static void __barrier(int n)
|
||||
{
|
||||
__work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
|
||||
}
|
||||
|
||||
__device__
|
||||
inline
|
||||
__attribute__((noduplicate))
|
||||
void __syncthreads()
|
||||
{
|
||||
__barrier(__CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
// hip.amdgcn.bc - device routine
|
||||
/*
|
||||
HW_ID Register bit structure
|
||||
WAVE_ID 3:0 Wave buffer slot number. 0-9.
|
||||
SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
|
||||
PIPE_ID 7:6 Pipeline from which the wave was dispatched.
|
||||
CU_ID 11:8 Compute Unit the wave is assigned to.
|
||||
SH_ID 12 Shader Array (within an SE) the wave is assigned to.
|
||||
SE_ID 14:13 Shader Engine the wave is assigned to.
|
||||
TG_ID 19:16 Thread-group ID
|
||||
VM_ID 23:20 Virtual Memory ID
|
||||
QUEUE_ID 26:24 Queue from which this wave was dispatched.
|
||||
STATE_ID 29:27 State ID (graphics only, not compute).
|
||||
ME_ID 31:30 Micro-engine ID.
|
||||
*/
|
||||
|
||||
#define HW_ID 4
|
||||
|
||||
#define HW_ID_CU_ID_SIZE 4
|
||||
#define HW_ID_CU_ID_OFFSET 8
|
||||
|
||||
#define HW_ID_SE_ID_SIZE 2
|
||||
#define HW_ID_SE_ID_OFFSET 13
|
||||
|
||||
/*
|
||||
Encoding of parameter bitmask
|
||||
HW_ID 5:0 HW_ID
|
||||
OFFSET 10:6 Range: 0..31
|
||||
SIZE 15:11 Range: 1..32
|
||||
*/
|
||||
|
||||
#define GETREG_IMMED(SZ,OFF,REG) (SZ << 11) | (OFF << 6) | REG
|
||||
|
||||
__device__
|
||||
inline
|
||||
unsigned __smid(void)
|
||||
{
|
||||
unsigned cu_id = __llvm_amdgcn_s_getreg(
|
||||
GETREG_IMMED(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
|
||||
unsigned se_id = __llvm_amdgcn_s_getreg(
|
||||
GETREG_IMMED(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
|
||||
|
||||
/* Each shader engine has 16 CU */
|
||||
return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
|
||||
}
|
||||
|
||||
// Macro to replace extern __shared__ declarations
|
||||
// to local variable definitions
|
||||
#define HIP_DYNAMIC_SHARED(type, var) \
|
||||
type* var = (type*)__amdgcn_get_dynamicgroupbaseptr();
|
||||
|
||||
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
|
||||
|
||||
#pragma push_macro("__DEVICE__")
|
||||
|
||||
#include <hip/hcc_detail/math_functions.h>
|
||||
|
||||
#endif
|
||||
|
||||
@@ -31,16 +31,7 @@ THE SOFTWARE.
|
||||
|
||||
#include "hip/hcc_detail/host_defines.h"
|
||||
|
||||
__device__
|
||||
unsigned long __llvm_amdgcn_s_memrealtime(void) __asm("llvm.amdgcn.s.memrealtime");
|
||||
|
||||
__device__
|
||||
unsigned __llvm_amdgcn_s_getreg(unsigned) __asm("llvm.amdgcn.s.getreg");
|
||||
|
||||
__device__
|
||||
unsigned __llvm_amdgcn_groupstaticsize() __asm("llvm.amdgcn.groupstaticsize");
|
||||
|
||||
__device__
|
||||
void __llvm_amdgcn_s_barrier() __asm("llvm.amdgcn.s.barrier");
|
||||
|
||||
#endif
|
||||
|
||||
새 이슈에서 참조
사용자 차단