diff --git a/bin/hip_embed_pch.sh b/bin/hip_embed_pch.sh index 79f07416e8..5606ec8522 100755 --- a/bin/hip_embed_pch.sh +++ b/bin/hip_embed_pch.sh @@ -24,10 +24,6 @@ cat >$tmp/hip_macros.h <$tmp/hip_pch.h <$tmp/hip_pch.h <$tmp/hip_pch.mcin <\s]+)\s+(\w+)\s*\[\s*\]\s*;/HIP_DYNAMIC_SHARED($1 $2, $3)/g; - $ft{'extern_shared'} += $k; -} - # CUDA Kernel Launch Syntax sub transformKernelLaunch { no warnings qw/uninitialized/; diff --git a/include/hip/amd_detail/device_functions.h b/include/hip/amd_detail/device_functions.h index 320fbc7422..273912256b 100644 --- a/include/hip/amd_detail/device_functions.h +++ b/include/hip/amd_detail/device_functions.h @@ -1269,14 +1269,6 @@ unsigned __smid(void) return (se_id << HW_ID_CU_ID_SIZE) + cu_id; } -// Macro to replace extern __shared__ declarations -// to local variable definitions -#define HIP_DYNAMIC_SHARED(type, var) \ - type* var = (type*)__amdgcn_get_dynamicgroupbaseptr(); - -#define HIP_DYNAMIC_SHARED_ATTRIBUTE - - #endif //defined(__clang__) && defined(__HIP__) diff --git a/include/hip/amd_detail/hip_runtime_api.h b/include/hip/amd_detail/hip_runtime_api.h index a8f7792d27..6301251a98 100644 --- a/include/hip/amd_detail/hip_runtime_api.h +++ b/include/hip/amd_detail/hip_runtime_api.h @@ -3065,8 +3065,8 @@ hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image, unsigned * @param [in] blockDimX X block dimensions specified in work-items * @param [in] blockDimY Y grid dimension specified in work-items * @param [in] blockDimZ Z grid dimension specified in work-items - * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The - * kernel can access this with HIP_DYNAMIC_SHARED. + * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The + * HIP-Clang compiler provides support for extern shared declarations. * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th * default stream is used with associated synchronization rules. * @param [in] kernelParams @@ -3092,8 +3092,8 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigne * @param [in] gridDim Grid dimensions specified as multiple of blockDim. * @param [in] blockDim Block dimensions specified in work-items * @param [in] kernelParams A list of kernel arguments - * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The - * kernel can access this with HIP_DYNAMIC_SHARED. + * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The + * HIP-Clang compiler provides support for extern shared declarations. * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th * default stream is used with associated synchronization rules. * @@ -3294,8 +3294,8 @@ hipError_t hipProfilerStop(); * * @param [in] gridDim grid dimension specified as multiple of blockDim. * @param [in] blockDim block dimensions specified in work-items - * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The - * kernel can access this with HIP_DYNAMIC_SHARED. + * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The + * HIP-Clang compiler provides support for extern shared declarations. * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the * default stream is used with associated synchronization rules. * @@ -3334,8 +3334,8 @@ hipError_t hipLaunchByPtr(const void* func); * * @param [in] gridDim grid dimension specified as multiple of blockDim. * @param [in] blockDim block dimensions specified in work-items - * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The - * kernel can access this with HIP_DYNAMIC_SHARED. + * @param [in] sharedMem Amount of dynamic shared memory to allocate for this kernel. The + * HIP-Clang compiler provides support for extern shared declarations. * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the * default stream is used with associated synchronization rules. * @@ -3354,7 +3354,7 @@ hipError_t __hipPushCallConfiguration(dim3 gridDim, * @param [out] gridDim grid dimension specified as multiple of blockDim. * @param [out] blockDim block dimensions specified in work-items * @param [out] sharedMem Amount of dynamic shared memory to allocate for this kernel. The - * kernel can access this with HIP_DYNAMIC_SHARED. + * HIP-Clang compiler provides support for extern shared declarations. * @param [out] stream Stream where the kernel should be dispatched. May be 0, in which case the * default stream is used with associated synchronization rules. * @@ -3373,8 +3373,8 @@ hipError_t __hipPopCallConfiguration(dim3 *gridDim, * @param [in] numBlocks - number of blocks * @param [in] dimBlocks - dimension of a block * @param [in] args - kernel arguments - * @param [in] sharedMemBytes - Amount of dynamic shared memory to allocate for this kernel. The - * Kernel can access this with HIP_DYNAMIC_SHARED. + * @param [in] sharedMemBytes - Amount of dynamic shared memory to allocate for this kernel. The + * HIP-Clang compiler provides support for extern shared declarations. * @param [in] stream - Stream where the kernel should be dispatched. May be 0, in which case th * default stream is used with associated synchronization rules. * diff --git a/include/hip/hip_ext.h b/include/hip/hip_ext.h index 4c6148970d..a60d63b305 100644 --- a/include/hip/hip_ext.h +++ b/include/hip/hip_ext.h @@ -43,7 +43,7 @@ THE SOFTWARE. * @param [in] blockDimY Y grid dimension specified in work-items * @param [in] blockDimZ Z grid dimension specified in work-items * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The - kernel can access this with HIP_DYNAMIC_SHARED. + HIP-Clang compiler provides support for extern shared declarations * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. * @param [in] kernelParams diff --git a/include/hip/nvidia_detail/hip_runtime.h b/include/hip/nvidia_detail/hip_runtime.h index 84414fb4a3..dfe41cf3c3 100644 --- a/include/hip/nvidia_detail/hip_runtime.h +++ b/include/hip/nvidia_detail/hip_runtime.h @@ -95,14 +95,6 @@ typedef int hipLaunchParm; #define HIP_SYMBOL(X) &X -/** - * extern __shared__ - */ - -#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[]; - -#define HIP_DYNAMIC_SHARED_ATTRIBUTE - #ifdef __HIP_DEVICE_COMPILE__ #define abort_() \ { asm("trap;"); } diff --git a/samples/2_Cookbook/6_dynamic_shared/Readme.md b/samples/2_Cookbook/6_dynamic_shared/Readme.md index 68782807bf..02a8cb3da2 100644 --- a/samples/2_Cookbook/6_dynamic_shared/Readme.md +++ b/samples/2_Cookbook/6_dynamic_shared/Readme.md @@ -19,10 +19,13 @@ We will be using the Simple Matrix Transpose application from the previous tutor ## Shared Memory -Shared memory is way more faster than that of global and constant memory and accessible to all the threads in the block. For In the same sourcecode, we will use the `HIP_DYNAMIC_SHARED` keyword to declare dynamic shared memory as follows: +Shared memory is way more faster than that of global and constant memory and accessible to all the threads in the block. + +Previously, it was essential to declare dynamic shared memory using the HIP_DYNAMIC_SHARED macro for accuracy, as using static shared memory in the same kernel could result in overlapping memory ranges and data-races. + +Now, the HIP-Clang compiler provides support for extern shared declarations, and the HIP_DYNAMIC_SHARED option is no longer required. You may use the standard extern definition: +extern __shared__ type var[]; -` HIP_DYNAMIC_SHARED(float, sharedMem) ` -here the first parameter is the data type while the second one is the variable name. The other important change is: ``` diff --git a/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp b/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp index 743da8f63c..65922afc86 100644 --- a/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp +++ b/samples/2_Cookbook/6_dynamic_shared/dynamic_shared.cpp @@ -35,8 +35,7 @@ THE SOFTWARE. // Device (Kernel) function, it must be void __global__ void matrixTranspose(float* out, float* in, const int width) { - // declare dynamic shared memory - HIP_DYNAMIC_SHARED(float, sharedMem); + extern __shared__ float sharedMem[]; int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; diff --git a/samples/2_Cookbook/7_streams/stream.cpp b/samples/2_Cookbook/7_streams/stream.cpp index dd87e4363e..f2bf289c9d 100644 --- a/samples/2_Cookbook/7_streams/stream.cpp +++ b/samples/2_Cookbook/7_streams/stream.cpp @@ -49,8 +49,7 @@ __global__ void matrixTranspose_static_shared(float* out, float* in, __global__ void matrixTranspose_dynamic_shared(float* out, float* in, const int width) { - // declare dynamic shared memory - HIP_DYNAMIC_SHARED(float, sharedMem) + extern __shared__ float sharedMem[]; int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; diff --git a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp index 8e011b1281..1f61e65876 100644 --- a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp +++ b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp @@ -122,8 +122,7 @@ __global__ void matrixTranspose_static_shared(float* out, float* in, __global__ void matrixTranspose_dynamic_shared(float* out, float* in, const int width) { - // declare dynamic shared memory - HIP_DYNAMIC_SHARED(float, sharedMem) + extern __shared__ float sharedMem[]; int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; diff --git a/tests/src/kernel/hipDynamicShared.cpp b/tests/src/kernel/hipDynamicShared.cpp index cc147e550f..579432bf3d 100644 --- a/tests/src/kernel/hipDynamicShared.cpp +++ b/tests/src/kernel/hipDynamicShared.cpp @@ -49,12 +49,7 @@ template __global__ void testExternSharedKernel(const T* A_d, const T* B_d, T* C_d, size_t numElements, size_t groupElements) { // declare dynamic shared memory -#if defined(__HIP_PLATFORM_AMD__) - HIP_DYNAMIC_SHARED(T, sdata) -#else - HIP_DYNAMIC_SHARED(__align__(sizeof(T)) unsigned char, my_sdata) - T* sdata = reinterpret_cast(my_sdata); -#endif + extern __shared__ double sdata[]; size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); size_t tid = threadIdx.x; diff --git a/tests/src/kernel/hipDynamicShared2.cpp b/tests/src/kernel/hipDynamicShared2.cpp index b36e75346d..f92a89b844 100644 --- a/tests/src/kernel/hipDynamicShared2.cpp +++ b/tests/src/kernel/hipDynamicShared2.cpp @@ -33,7 +33,7 @@ THE SOFTWARE. #define SIZE LEN * 4 __global__ void vectorAdd(float* Ad, float* Bd) { - HIP_DYNAMIC_SHARED(float, sBd); + extern __shared__ float sBd[]; int tx = threadIdx.x; for (int i = 0; i < LEN / 64; i++) { sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;