diff --git a/.github/workflows/rocprofiler-compute-formatting.yml b/.github/workflows/rocprofiler-compute-formatting.yml index 42f8351d56..0e65719eac 100644 --- a/.github/workflows/rocprofiler-compute-formatting.yml +++ b/.github/workflows/rocprofiler-compute-formatting.yml @@ -41,7 +41,7 @@ jobs: working-directory: projects/rocprofiler-compute run: | python -m pip install --upgrade pip - python -m pip install ruff + python -m pip install ruff==0.14.11 if [ -f requirements.txt ]; then python -m pip install -r requirements.txt; fi - name: Run Ruff Linter and Import Sorter run: | diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index acee11a3f1..ee822e16c6 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -226,6 +226,12 @@ class AMDSMICommands(): # Set args.* to passed in arguments if gpu: args.gpu = gpu + + cpu_attributes = ["cpu"] + for attr in cpu_attributes: + if hasattr(args, 'cpu') and getattr(args, 'cpu'): + print("N/A") + return # Handle No GPU passed if args.gpu == None: diff --git a/projects/clr/CHANGELOG.md b/projects/clr/CHANGELOG.md index 292125904d..80756af55e 100644 --- a/projects/clr/CHANGELOG.md +++ b/projects/clr/CHANGELOG.md @@ -8,6 +8,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs * New HIP APIs - `hipKernelGetParamInfo` returns the offset and size of a kernel parameter +* Support for `barrier_arrive` and `barrier_wait` for `grid_group` and `thread_block`. * New HIP supports - `grid_group::block_rank()` returns the rank of the block in the calling thread diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h index 132abf713d..8d07119f27 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_bf16.h @@ -111,9 +111,8 @@ #if !defined(__HIPCC_RTC__) #include #include "amd_hip_vector_types.h" // float2 etc -#include "device_library_decls.h" // ocml conversion functions -#include "math_fwd.h" // ocml device functions #if defined(__clang__) && defined(__HIP__) +#include "math_fwd.h" // ocml device functions #include // define warpSize #include // Sync functions #endif @@ -338,7 +337,11 @@ struct __attribute__((aligned(2))) __hip_bfloat16 { }; /**@}*/ +#if defined(__clang__) typedef __bf16 __bf16_2 __attribute__((ext_vector_type(2))); +#else +typedef __bf16 __bf16_2 __attribute__((vector_size(sizeof(__bf16) * 2))); +#endif /** * \defgroup HIP_INTRINSIC_BFLOAT162_STRUCT @@ -350,6 +353,7 @@ struct __attribute__((aligned(4))) __hip_bfloat162 { static_assert(sizeof(__hip_bfloat16[2]) == sizeof(__bf16_2)); public: +#if defined(__clang__) union { struct { __hip_bfloat16 x; /*! \brief raw representation of bfloat16 */ @@ -357,7 +361,12 @@ struct __attribute__((aligned(4))) __hip_bfloat162 { }; __bf16_2 __xy_bf162; }; - +#else + /* GCC does not support anonymous structs with members that have non-trivial constructors (Clang + allows this as an extension). Expose x and y directly instead. */ + __hip_bfloat16 x; + __hip_bfloat16 y; +#endif public: /*! \brief create __hip_bfloat162 from __hip_bfloat162_raw */ @@ -373,7 +382,11 @@ struct __attribute__((aligned(4))) __hip_bfloat162 { : x(a), y(b) {} /*! \brief create __hip_bfloat162 from vector of __bf16_2 */ +#if defined(__clang__) __BF16_HOST_DEVICE__ __hip_bfloat162(const __bf16_2 in) : __xy_bf162(in) {} +#else + __BF16_HOST_DEVICE__ __hip_bfloat162(const __bf16_2 in) : x{in[0]}, y{in[1]} {} +#endif /*! \brief default constructor of __hip_bfloat162 */ __BF16_HOST_DEVICE__ __hip_bfloat162() = default; @@ -392,11 +405,22 @@ struct __attribute__((aligned(4))) __hip_bfloat162 { } /*! \brief return a vector of bf16 */ - __BF16_HOST_DEVICE__ operator __bf16_2() const { return __xy_bf162; } + __BF16_HOST_DEVICE__ operator __bf16_2() const { +#if defined(__clang__) + return __xy_bf162; +#else + return __bf16_2{x, y}; +#endif + } /*! \brief return a vector of bf16 */ __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __bf16_2 in) { +#if defined(__clang__) __xy_bf162 = in; +#else + x = __hip_bfloat16{in[0]}; + y = __hip_bfloat16{in[1]}; +#endif return *this; } @@ -835,6 +859,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const return (__bf16)a / (__bf16)b; } +#if defined(__clang__) && defined(__HIP__) /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \brief Performs FMA of given bfloat16 values @@ -844,6 +869,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip return __hip_bfloat16(__builtin_elementwise_fma(__bf16(a), __bf16(b), __bf16(c))); ; } +#endif /** * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH @@ -919,6 +945,8 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2_rn(const __hip_bfloat162 a, return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)}; } + +#if defined(__clang__) && defined(__HIP__) /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \brief Performs FMA of given bfloat162 values @@ -927,6 +955,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __ const __hip_bfloat162 c) { return __hip_bfloat162{__builtin_elementwise_fma(__bf16_2(a), __bf16_2(b), __bf16_2(c))}; } +#endif /** * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH @@ -1639,6 +1668,7 @@ __BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat162& l, const __hi return fl.x >= fr.x && fl.x >= fr.y; } +#if defined(__clang__) && defined(__HIP__) /** * \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \brief Calculate ceil of bfloat16 @@ -1883,7 +1913,6 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h) { return __hip_bfloat162(htrunc(h.x), htrunc(h.y)); } -#if defined(__clang__) && defined(__HIP__) /** * \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \brief Atomic add bfloat162 diff --git a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h index 62cb66fb90..80b7f71c23 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h +++ b/projects/clr/hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h @@ -212,6 +212,19 @@ class grid_group : public thread_group { //! @copydoc thread_group::sync __CG_QUALIFIER__ void sync() const { internal::grid::sync(); } __CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); } + struct arrival_token { + unsigned int signal; + }; + //! Arrive at a barrier + __CG_QUALIFIER__ arrival_token barrier_arrive() const { + arrival_token t; + t.signal = internal::grid::barrier_signal(); + return t; + } + //! Arrive at a barrier + __CG_QUALIFIER__ void barrier_wait(arrival_token&& t) const { + internal::grid::barrier_wait(t.signal); + } }; /** \ingroup CooperativeGConstruct @@ -295,6 +308,14 @@ class thread_block : public thread_group { __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); } //! Returns the group dimensions. __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); } + struct arrival_token {}; + //! Arrive at a barrier + __CG_QUALIFIER__ arrival_token barrier_arrive() const { + internal::workgroup::barrier_arrive(); + return arrival_token{}; + } + //! Arrive at a barrier + __CG_QUALIFIER__ void barrier_wait(arrival_token&&) const { internal::workgroup::barrier_wait(); } }; /** \ingroup CooperativeGConstruct diff --git a/projects/clr/hipamd/include/hip/amd_detail/device_library_decls.h b/projects/clr/hipamd/include/hip/amd_detail/device_library_decls.h index 39c2e59686..791d2e7137 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/device_library_decls.h +++ b/projects/clr/hipamd/include/hip/amd_detail/device_library_decls.h @@ -97,6 +97,8 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void); extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void); extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void); extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void); +extern "C" __device__ __attribute__((const)) uint __ockl_grid_bar_arrive(void); +extern "C" __device__ __attribute__((convergent)) void __ockl_grid_bar_wait(uint); extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float); diff --git a/projects/clr/hipamd/include/hip/amd_detail/hip_api_trace.hpp b/projects/clr/hipamd/include/hip/amd_detail/hip_api_trace.hpp index f1864eaca4..7845b69e8a 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/hip_api_trace.hpp +++ b/projects/clr/hipamd/include/hip/amd_detail/hip_api_trace.hpp @@ -63,7 +63,7 @@ #define HIP_API_TABLE_STEP_VERSION 0 #define HIP_COMPILER_API_TABLE_STEP_VERSION 0 #define HIP_TOOLS_API_TABLE_STEP_VERSION 0 -#define HIP_RUNTIME_API_TABLE_STEP_VERSION 20 +#define HIP_RUNTIME_API_TABLE_STEP_VERSION 21 // HIP API interface // HIP compiler dispatch functions @@ -1113,6 +1113,9 @@ typedef hipError_t (*t_hipKernelGetLibrary)(hipLibrary_t* library, hipKernel_t k typedef hipError_t (*t_hipKernelGetName)(const char** name, hipKernel_t kernel); typedef hipError_t (*t_hipGetProcAddress_spt)(const char* symbol, void** pfn, int hipVersion, uint64_t flags, hipDriverProcAddressQueryResult* symbolStatus); +typedef hipError_t (*t_hipExtDisableLogging)(); +typedef hipError_t (*t_hipExtEnableLogging)(); +typedef hipError_t (*t_hipExtSetLoggingParams)(size_t log_level, size_t log_size, size_t log_mask); typedef hipError_t (*t_hipKernelGetParamInfo)(hipKernel_t kernel, size_t paramIndex, size_t* paramOffset, size_t* paramSize); @@ -1707,8 +1710,13 @@ struct HipDispatchTable { // HIP_RUNTIME_API_TABLE_STEP_VERSION == 20 t_hipKernelGetParamInfo hipKernelGetParamInfo_fn; - // DO NOT EDIT ABOVE! // HIP_RUNTIME_API_TABLE_STEP_VERSION == 21 + t_hipExtDisableLogging hipExtDisableLogging_fn; + t_hipExtEnableLogging hipExtEnableLogging_fn; + t_hipExtSetLoggingParams hipExtSetLoggingParams_fn; + + // DO NOT EDIT ABOVE! + // HIP_RUNTIME_API_TABLE_STEP_VERSION == 22 // ******************************************************************************************* // // diff --git a/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h b/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h index d9671eb342..771e8419ff 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h +++ b/projects/clr/hipamd/include/hip/amd_detail/hip_cooperative_groups_helper.h @@ -196,6 +196,9 @@ __CG_STATIC_QUALIFIER__ dim3 grid_dim() { static_cast<__hip_uint32_t>(gridDim.z))); } +__CG_STATIC_QUALIFIER__ unsigned int barrier_signal() { return __ockl_grid_bar_arrive(); } + +__CG_STATIC_QUALIFIER__ void barrier_wait(unsigned int s) { __ockl_grid_bar_wait(s); } } // namespace grid /** @@ -238,6 +241,23 @@ __CG_STATIC_QUALIFIER__ dim3 block_dim() { static_cast<__hip_uint32_t>(blockDim.z))); } +__CG_STATIC_QUALIFIER__ void barrier_arrive() { + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); +#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \ + __has_builtin(__builtin_amdgcn_s_barrier_wait) + __builtin_amdgcn_s_barrier_signal(-1); +#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait +} + +__CG_STATIC_QUALIFIER__ void barrier_wait() { +#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \ + __has_builtin(__builtin_amdgcn_s_barrier_wait) + __builtin_amdgcn_s_barrier_wait(-1); +#else + __builtin_amdgcn_s_barrier(); +#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait + __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); +} } // namespace workgroup namespace tiled_group { diff --git a/projects/clr/hipamd/include/hip/amd_detail/hip_prof_str.h b/projects/clr/hipamd/include/hip/amd_detail/hip_prof_str.h index 7aef3e7f52..2dc1a0d3b2 100644 --- a/projects/clr/hipamd/include/hip/amd_detail/hip_prof_str.h +++ b/projects/clr/hipamd/include/hip/amd_detail/hip_prof_str.h @@ -468,7 +468,10 @@ enum hip_api_id_t { HIP_API_ID_hipKernelGetName = 448, HIP_API_ID_hipOccupancyAvailableDynamicSMemPerBlock = 449, HIP_API_ID_hipKernelGetParamInfo = 450, - HIP_API_ID_LAST = 450, + HIP_API_ID_hipExtDisableLogging = 451, + HIP_API_ID_hipExtEnableLogging = 452, + HIP_API_ID_hipExtSetLoggingParams = 453, + HIP_API_ID_LAST = 453, HIP_API_ID_hipChooseDevice = HIP_API_ID_CONCAT(HIP_API_ID_,hipChooseDevice), HIP_API_ID_hipGetDeviceProperties = HIP_API_ID_CONCAT(HIP_API_ID_,hipGetDeviceProperties), @@ -590,12 +593,15 @@ static inline const char* hip_api_name(const uint32_t id) { case HIP_API_ID_hipEventRecord: return "hipEventRecord"; case HIP_API_ID_hipEventRecordWithFlags: return "hipEventRecordWithFlags"; case HIP_API_ID_hipEventSynchronize: return "hipEventSynchronize"; + case HIP_API_ID_hipExtDisableLogging: return "hipExtDisableLogging"; + case HIP_API_ID_hipExtEnableLogging: return "hipExtEnableLogging"; case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError"; case HIP_API_ID_hipExtGetLinkTypeAndHopCount: return "hipExtGetLinkTypeAndHopCount"; case HIP_API_ID_hipExtLaunchKernel: return "hipExtLaunchKernel"; case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: return "hipExtLaunchMultiKernelMultiDevice"; case HIP_API_ID_hipExtMallocWithFlags: return "hipExtMallocWithFlags"; case HIP_API_ID_hipExtModuleLaunchKernel: return "hipExtModuleLaunchKernel"; + case HIP_API_ID_hipExtSetLoggingParams: return "hipExtSetLoggingParams"; case HIP_API_ID_hipExtStreamCreateWithCUMask: return "hipExtStreamCreateWithCUMask"; case HIP_API_ID_hipExtStreamGetCUMask: return "hipExtStreamGetCUMask"; case HIP_API_ID_hipExternalMemoryGetMappedBuffer: return "hipExternalMemoryGetMappedBuffer"; @@ -1034,12 +1040,15 @@ static inline uint32_t hipApiIdByName(const char* name) { if (strcmp("hipEventRecord", name) == 0) return HIP_API_ID_hipEventRecord; if (strcmp("hipEventRecordWithFlags", name) == 0) return HIP_API_ID_hipEventRecordWithFlags; if (strcmp("hipEventSynchronize", name) == 0) return HIP_API_ID_hipEventSynchronize; + if (strcmp("hipExtDisableLogging", name) == 0) return HIP_API_ID_hipExtDisableLogging; + if (strcmp("hipExtEnableLogging", name) == 0) return HIP_API_ID_hipExtEnableLogging; if (strcmp("hipExtGetLastError", name) == 0) return HIP_API_ID_hipExtGetLastError; if (strcmp("hipExtGetLinkTypeAndHopCount", name) == 0) return HIP_API_ID_hipExtGetLinkTypeAndHopCount; if (strcmp("hipExtLaunchKernel", name) == 0) return HIP_API_ID_hipExtLaunchKernel; if (strcmp("hipExtLaunchMultiKernelMultiDevice", name) == 0) return HIP_API_ID_hipExtLaunchMultiKernelMultiDevice; if (strcmp("hipExtMallocWithFlags", name) == 0) return HIP_API_ID_hipExtMallocWithFlags; if (strcmp("hipExtModuleLaunchKernel", name) == 0) return HIP_API_ID_hipExtModuleLaunchKernel; + if (strcmp("hipExtSetLoggingParams", name) == 0) return HIP_API_ID_hipExtSetLoggingParams; if (strcmp("hipExtStreamCreateWithCUMask", name) == 0) return HIP_API_ID_hipExtStreamCreateWithCUMask; if (strcmp("hipExtStreamGetCUMask", name) == 0) return HIP_API_ID_hipExtStreamGetCUMask; if (strcmp("hipExternalMemoryGetMappedBuffer", name) == 0) return HIP_API_ID_hipExternalMemoryGetMappedBuffer; @@ -1851,6 +1860,11 @@ typedef struct hip_api_data_s { hipEvent_t stopEvent; unsigned int flags; } hipExtModuleLaunchKernel; + struct { + size_t log_level; + size_t log_size; + size_t log_mask; + } hipExtSetLoggingParams; struct { hipStream_t* stream; hipStream_t stream__val; @@ -4484,6 +4498,12 @@ typedef struct hip_api_data_s { #define INIT_hipEventSynchronize_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipEventSynchronize.event = (hipEvent_t)event; \ }; +// hipExtDisableLogging[] +#define INIT_hipExtDisableLogging_CB_ARGS_DATA(cb_data) { \ +}; +// hipExtEnableLogging[] +#define INIT_hipExtEnableLogging_CB_ARGS_DATA(cb_data) { \ +}; // hipExtGetLastError[] #define INIT_hipExtGetLastError_CB_ARGS_DATA(cb_data) { \ }; @@ -4535,6 +4555,12 @@ typedef struct hip_api_data_s { cb_data.args.hipExtModuleLaunchKernel.stopEvent = (hipEvent_t)stopEvent; \ cb_data.args.hipExtModuleLaunchKernel.flags = (unsigned int)flags; \ }; +// hipExtSetLoggingParams[('size_t', 'log_level'), ('size_t', 'log_size'), ('size_t', 'log_mask')] +#define INIT_hipExtSetLoggingParams_CB_ARGS_DATA(cb_data) { \ + cb_data.args.hipExtSetLoggingParams.log_level = (size_t)log_level; \ + cb_data.args.hipExtSetLoggingParams.log_size = (size_t)log_size; \ + cb_data.args.hipExtSetLoggingParams.log_mask = (size_t)log_mask; \ +}; // hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')] #define INIT_hipExtStreamCreateWithCUMask_CB_ARGS_DATA(cb_data) { \ cb_data.args.hipExtStreamCreateWithCUMask.stream = (hipStream_t*)stream; \ @@ -7125,6 +7151,12 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { // hipEventSynchronize[('hipEvent_t', 'event')] case HIP_API_ID_hipEventSynchronize: break; +// hipExtDisableLogging[] + case HIP_API_ID_hipExtDisableLogging: + break; +// hipExtEnableLogging[] + case HIP_API_ID_hipExtEnableLogging: + break; // hipExtGetLastError[] case HIP_API_ID_hipExtGetLastError: break; @@ -7150,6 +7182,9 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) { if (data->args.hipExtModuleLaunchKernel.kernelParams) data->args.hipExtModuleLaunchKernel.kernelParams__val = *(data->args.hipExtModuleLaunchKernel.kernelParams); if (data->args.hipExtModuleLaunchKernel.extra) data->args.hipExtModuleLaunchKernel.extra__val = *(data->args.hipExtModuleLaunchKernel.extra); break; +// hipExtSetLoggingParams[('size_t', 'log_level'), ('size_t', 'log_size'), ('size_t', 'log_mask')] + case HIP_API_ID_hipExtSetLoggingParams: + break; // hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')] case HIP_API_ID_hipExtStreamCreateWithCUMask: if (data->args.hipExtStreamCreateWithCUMask.stream) data->args.hipExtStreamCreateWithCUMask.stream__val = *(data->args.hipExtStreamCreateWithCUMask.stream); @@ -9124,6 +9159,14 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventSynchronize.event); oss << ")"; break; + case HIP_API_ID_hipExtDisableLogging: + oss << "hipExtDisableLogging("; + oss << ")"; + break; + case HIP_API_ID_hipExtEnableLogging: + oss << "hipExtEnableLogging("; + oss << ")"; + break; case HIP_API_ID_hipExtGetLastError: oss << "hipExtGetLastError("; oss << ")"; @@ -9188,6 +9231,13 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.flags); oss << ")"; break; + case HIP_API_ID_hipExtSetLoggingParams: + oss << "hipExtSetLoggingParams("; + oss << "log_level="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_level); + oss << ", log_size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_size); + oss << ", log_mask="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_mask); + oss << ")"; + break; case HIP_API_ID_hipExtStreamCreateWithCUMask: oss << "hipExtStreamCreateWithCUMask("; if (data->args.hipExtStreamCreateWithCUMask.stream == NULL) oss << "stream=NULL"; diff --git a/projects/clr/hipamd/src/CMakeLists.txt b/projects/clr/hipamd/src/CMakeLists.txt index aa8d307688..0f7c537ba4 100644 --- a/projects/clr/hipamd/src/CMakeLists.txt +++ b/projects/clr/hipamd/src/CMakeLists.txt @@ -110,6 +110,7 @@ target_sources(amdhip64 PRIVATE hip_graph.cpp hip_hmm.cpp hip_intercept.cpp + hip_log.cpp hip_memory.cpp hip_mempool.cpp hip_mempool_impl.cpp diff --git a/projects/clr/hipamd/src/amdhip.def b/projects/clr/hipamd/src/amdhip.def index dff046d1ac..9a12cbf080 100644 --- a/projects/clr/hipamd/src/amdhip.def +++ b/projects/clr/hipamd/src/amdhip.def @@ -523,3 +523,6 @@ hipKernelGetName hipOccupancyAvailableDynamicSMemPerBlock hipGetProcAddress_spt hipKernelGetParamInfo +hipExtDisableLogging +hipExtEnableLogging +hipExtSetLoggingParams diff --git a/projects/clr/hipamd/src/hip_api_trace.cpp b/projects/clr/hipamd/src/hip_api_trace.cpp index 6399924ec2..564f0bccea 100644 --- a/projects/clr/hipamd/src/hip_api_trace.cpp +++ b/projects/clr/hipamd/src/hip_api_trace.cpp @@ -885,6 +885,9 @@ hipError_t hipOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, con int numBlocks, int blockSize); hipError_t hipKernelGetParamInfo(hipKernel_t kernel, size_t paramIndex, size_t* paramOffset, size_t* paramSize); +hipError_t hipExtDisableLogging(); +hipError_t hipExtEnableLogging(); +hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask); } // namespace hip namespace hip { @@ -1432,6 +1435,9 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) { ptrDispatchTable->hipKernelGetName_fn = hip::hipKernelGetName; ptrDispatchTable->hipOccupancyAvailableDynamicSMemPerBlock_fn = hip::hipOccupancyAvailableDynamicSMemPerBlock; ptrDispatchTable->hipKernelGetParamInfo_fn = hip::hipKernelGetParamInfo; + ptrDispatchTable->hipExtDisableLogging_fn = hip::hipExtDisableLogging; + ptrDispatchTable->hipExtEnableLogging_fn = hip::hipExtEnableLogging; + ptrDispatchTable->hipExtSetLoggingParams_fn = hip::hipExtSetLoggingParams; } #if HIP_ROCPROFILER_REGISTER > 0 @@ -2114,15 +2120,19 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipOccupancyAvailableDynamicSMemPerBlock_fn, 5 HIP_ENFORCE_ABI(HipDispatchTable, hipGetProcAddress_spt_fn, 506); // HIP_RUNTIME_API_TABLE_STEP_VERSION == 20 HIP_ENFORCE_ABI(HipDispatchTable, hipKernelGetParamInfo_fn, 507); +// HIP_RUNTIME_API_TABLE_STEP_VERSION == 21 +HIP_ENFORCE_ABI(HipDispatchTable, hipExtDisableLogging_fn, 508); +HIP_ENFORCE_ABI(HipDispatchTable, hipExtEnableLogging_fn, 509); +HIP_ENFORCE_ABI(HipDispatchTable, hipExtSetLoggingParams_fn, 510); // if HIP_ENFORCE_ABI entries are added for each new function pointer in the table, the number below // will be +1 of the number in the last HIP_ENFORCE_ABI line. E.g.: // // HIP_ENFORCE_ABI(, , 8) // // HIP_ENFORCE_ABI_VERSIONING(
, 9) <- 8 + 1 = 9 -HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 508) +HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 511) -static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 20, +static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 21, "If you get this error, add new HIP_ENFORCE_ABI(...) code for the new function " "pointers and then update this check so it is true"); #endif diff --git a/projects/clr/hipamd/src/hip_hcc.map.in b/projects/clr/hipamd/src/hip_hcc.map.in index 37437589e8..79c15835a5 100644 --- a/projects/clr/hipamd/src/hip_hcc.map.in +++ b/projects/clr/hipamd/src/hip_hcc.map.in @@ -645,6 +645,9 @@ global: hipOccupancyAvailableDynamicSMemPerBlock; hipGetProcAddress_spt; hipKernelGetParamInfo; + hipExtDisableLogging; + hipExtEnableLogging; + hipExtSetLoggingParams; local: *; } hip_7.1; diff --git a/projects/clr/hipamd/src/hip_log.cpp b/projects/clr/hipamd/src/hip_log.cpp new file mode 100644 index 0000000000..6d43c0dc77 --- /dev/null +++ b/projects/clr/hipamd/src/hip_log.cpp @@ -0,0 +1,31 @@ +#include +#include "hip_internal.hpp" +#include "hip_platform.hpp" + +namespace hip { + +hipError_t hipExtEnableLogging() { + HIP_INIT_API(hipExtEnableLogging); + amd::ScopedLock lock(PlatformState::instance().getLogLock()); + AMD_LOG_LEVEL = PlatformState::instance().log_level_; + AMD_LOG_MASK = PlatformState::instance().log_mask_; + HIP_RETURN(hipSuccess); +} + +hipError_t hipExtDisableLogging() { + HIP_INIT_API(hipExtDisableLogging); + amd::ScopedLock lock(PlatformState::instance().getLogLock()); + AMD_LOG_LEVEL = 0; + HIP_RETURN(hipSuccess); +} + +hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask) { + HIP_INIT_API(hipExtSetLoggingParams, log_level, log_size, log_mask); + amd::ScopedLock lock(PlatformState::instance().getLogLock()); + // Store logging parameters for later activation + PlatformState::instance().log_level_ = log_level; + PlatformState::instance().log_size_ = log_size; + PlatformState::instance().log_mask_ = log_mask; + HIP_RETURN(hipSuccess); +} +} // namespace::hip \ No newline at end of file diff --git a/projects/clr/hipamd/src/hip_platform.hpp b/projects/clr/hipamd/src/hip_platform.hpp index bd714206ef..ba8261f3d2 100644 --- a/projects/clr/hipamd/src/hip_platform.hpp +++ b/projects/clr/hipamd/src/hip_platform.hpp @@ -50,9 +50,12 @@ class PlatformState { // Unique FD Store Lock amd::Monitor ufd_lock_{true}; + // Lock for logging operations + amd::Monitor lg_lock_{true}; + // Singleton object static PlatformState* platform_; - PlatformState() {} + PlatformState() : log_level_(0), log_size_(0), log_mask_(0) {} ~PlatformState() {} public: @@ -113,6 +116,14 @@ class PlatformState { size_t UfdMapSize() const { return ufd_map_.size(); } + // Logging lock accessor + amd::Monitor& getLogLock() { return lg_lock_; } + + // Friend functions for logging access + friend hipError_t hipExtEnableLogging(); + friend hipError_t hipExtDisableLogging(); + friend hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask); + inline bool RegisterLibraryFunction(const hipKernel_t f, const hipLibrary_t l) { amd::ScopedLock lock(lock_); if (library_functions_.find(f) == library_functions_.end()) { @@ -150,5 +161,10 @@ class PlatformState { void* dynamicLibraryHandle_{nullptr}; std::unordered_map library_functions_; + + // Logging state (moved from LoggingInfo singleton) + size_t log_level_; + size_t log_size_; + size_t log_mask_; }; } // namespace hip diff --git a/projects/clr/hipamd/src/hip_table_interface.cpp b/projects/clr/hipamd/src/hip_table_interface.cpp index 443ba81ba5..d8783c9ee0 100644 --- a/projects/clr/hipamd/src/hip_table_interface.cpp +++ b/projects/clr/hipamd/src/hip_table_interface.cpp @@ -2067,4 +2067,13 @@ hipError_t hipKernelGetParamInfo(hipKernel_t kernel, size_t paramIndex, size_t* size_t* paramSize) { return hip::GetHipDispatchTable()->hipKernelGetParamInfo_fn(kernel, paramIndex, paramOffset, paramSize); +} +hipError_t hipExtEnableLogging() { + return hip::GetHipDispatchTable()->hipExtEnableLogging_fn(); +} +hipError_t hipExtDisableLogging() { + return hip::GetHipDispatchTable()->hipExtDisableLogging_fn(); +} +hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask) { + return hip::GetHipDispatchTable()->hipExtSetLoggingParams_fn(log_level, log_size, log_mask); } \ No newline at end of file diff --git a/projects/hip-tests/catch/hipTestMain/config/config_amd_linux b/projects/hip-tests/catch/hipTestMain/config/config_amd_linux index 5413570155..6a168deafe 100644 --- a/projects/hip-tests/catch/hipTestMain/config/config_amd_linux +++ b/projects/hip-tests/catch/hipTestMain/config/config_amd_linux @@ -138,14 +138,6 @@ "=== TODO ===", "Unit_Device_tgammaf_Accuracy_Limited_Positive", "=== TODO === fail on 100% test data", - "Unit_Device_hexp10_Accuracy_Positive", - "Unit_Device_h2exp10_Accuracy_Positive", - "Unit_Device_hexp2_Accuracy_Positive", - "Unit_Device_h2exp2_Accuracy_Positive", - "Unit_Device_hlog_Accuracy_Positive", - "Unit_Device_h2log_Accuracy_Positive", - "Unit_Device_hlog10_Accuracy_Positive", - "Unit_Device_h2log10_Accuracy_Positive", "Unit_Device___hfma2_Accuracy_Positive", #endif #if defined gfx90a || defined gfx942 || defined gfx950 diff --git a/projects/hip-tests/catch/hipTestMain/config/config_amd_windows b/projects/hip-tests/catch/hipTestMain/config/config_amd_windows index 85e17cd381..24e5cefcf5 100644 --- a/projects/hip-tests/catch/hipTestMain/config/config_amd_windows +++ b/projects/hip-tests/catch/hipTestMain/config/config_amd_windows @@ -752,6 +752,8 @@ #endif "=== Following tests disabled as it should be a local perf test", "Performance_hipExtLaunchKernelGGL_QueryGPUFrequency", + "Unit_hipDynamicLogging_Positive_Basic", + "Unit_hipDynamicLogging_Positive_MultipleEnableDisable", "End of json" ] } diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt b/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt index f22d40f944..c0d2a00e3c 100644 --- a/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/cooperativeGrps/CMakeLists.txt @@ -20,6 +20,7 @@ set(TEST_SRC binary_partition.cc cg_ballot.cc cg_any_all.cc + split_barrier.cc ) if(HIP_PLATFORM STREQUAL "nvidia") diff --git a/projects/hip-tests/catch/unit/cooperativeGrps/split_barrier.cc b/projects/hip-tests/catch/unit/cooperativeGrps/split_barrier.cc new file mode 100644 index 0000000000..f7376b6605 --- /dev/null +++ b/projects/hip-tests/catch/unit/cooperativeGrps/split_barrier.cc @@ -0,0 +1,123 @@ +/* +Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include + +static __global__ void wg_split_barrier(float *out, float *in) { + namespace cg = cooperative_groups; + + __shared__ float mid[32]; + size_t i = threadIdx.x; + auto tb = cg::this_thread_block(); + + out[i] = in[i] * 2.0f; + + auto tok = tb.barrier_arrive(); + + // use tid 0 to populate shared mem + if (i == 0) { + for (size_t j = 0; j < 32; j++) { + mid[j] = in[j]; + } + } + + tb.barrier_wait(std::move(tok)); + + out[i] += mid[i]; +} + +TEST_CASE("Unit_coop_thread_block_split_barrier") { + constexpr size_t size = 32; + float *d_out, *d_in; + + HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size)); + HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size)); + + std::vector in(size, 0.0f), out = in; + for (size_t i = 0; i < size; i++) { + in[i] = i + 1; + } + + HIP_CHECK(hipMemset(d_out, 0, sizeof(float) * size)); + HIP_CHECK( + hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice)); + wg_split_barrier<<<1, size>>>(d_out, d_in); + HIP_CHECK(hipMemcpy(out.data(), d_out, sizeof(float) * size, + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipFree(d_out)); + HIP_CHECK(hipFree(d_in)); + + for (size_t i = 0; i < size; i++) { + INFO("Index: " << i << " in: " << in[i] << " out: " << out[i]); + REQUIRE((in[i] * 3.0f) == Catch::Approx(out[i])); + } +} + +static __global__ void grid_split_barrier(int *data, int *result, int N) { + namespace cg = cooperative_groups; + cg::grid_group grid = cg::this_grid(); + + int gid = blockIdx.x * blockDim.x + threadIdx.x; + auto tok = grid.barrier_arrive(); + if (gid < N) { + data[gid] = gid + 1; + } + + grid.barrier_wait(std::move(tok)); + + if (grid.thread_rank() == 0) { + int sum = 0; + for (int i = 0; i < N; i++) + sum += data[i]; + *result = sum; + } +} + +TEST_CASE("Unit_coop_grids_split_barrier") { + hipDeviceProp_t prop; + HIP_CHECK(hipGetDeviceProperties(&prop, 0)); + + if (prop.cooperativeLaunch != 0) { + int N = 1024; + const int threads = 128; + const int blocks = (N + threads - 1) / threads; + + int *d_in, *d_out; + HIP_CHECK(hipMalloc(&d_in, N * sizeof(int))); + HIP_CHECK(hipMalloc(&d_out, sizeof(int))); + + void *args[] = {&d_in, &d_out, &N}; + + dim3 grid(blocks); + dim3 block(threads); + + HIP_CHECK(hipLaunchCooperativeKernel((void *)grid_split_barrier, grid, + block, args, 0, 0)); + HIP_CHECK(hipDeviceSynchronize()); + + int out = 0; + HIP_CHECK(hipMemcpy(&out, d_out, sizeof(int), hipMemcpyDeviceToHost)); + + HIP_CHECK(hipFree(d_in)); + HIP_CHECK(hipFree(d_out)); + REQUIRE(out == ((N * (N + 1)) / 2)); + } +} diff --git a/projects/hip-tests/catch/unit/errorHandling/CMakeLists.txt b/projects/hip-tests/catch/unit/errorHandling/CMakeLists.txt index b1e9220bfc..873417baf7 100644 --- a/projects/hip-tests/catch/unit/errorHandling/CMakeLists.txt +++ b/projects/hip-tests/catch/unit/errorHandling/CMakeLists.txt @@ -7,6 +7,7 @@ set(TEST_SRC hipDrvGetErrorString.cc hipGetLastError.cc hipPeekAtLastError.cc + hipDynamicLogging.cc ) if(UNIX) diff --git a/projects/hip-tests/catch/unit/errorHandling/OutCapture.hh b/projects/hip-tests/catch/unit/errorHandling/OutCapture.hh new file mode 100644 index 0000000000..febd7e429f --- /dev/null +++ b/projects/hip-tests/catch/unit/errorHandling/OutCapture.hh @@ -0,0 +1,134 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#include +#include +#define dup _dup +#define dup2 _dup2 +#define fd_close _close +#define unlink _unlink +#define STDERR_FD _fileno(stderr) +#define OPEN_FLAGS (_O_WRONLY | _O_CREAT | _O_TRUNC) +#define OPEN_MODE (_S_IREAD | _S_IWRITE) +#define open _open +#else +#include +#define fd_close close +#define STDERR_FD STDERR_FILENO +#define OPEN_FLAGS (O_WRONLY | O_CREAT | O_TRUNC) +#define OPEN_MODE 0644 +#endif + +// Class to capture all stderr output (HIP logging uses stderr) +class OutCapture { +private: + std::stringstream captured_stream_; + std::streambuf* cerr_backup_; + int stderr_backup_; + std::string temp_file_; + + static std::string getTempFilePath() { +#ifdef _WIN32 + char temp_path[MAX_PATH]; + if (GetTempPathA(MAX_PATH, temp_path)) { + return std::string(temp_path) + "hip_stderr_capture.txt"; + } + // Fallback to current directory + return "hip_stderr_capture.txt"; +#else + return "/tmp/hip_stderr_capture.txt"; +#endif + } + +public: + OutCapture() : temp_file_(getTempFilePath()) { + // Backup original cerr stream buffer (HIP logging uses stderr) + cerr_backup_ = std::cerr.rdbuf(); + + // Backup original stderr file descriptor + stderr_backup_ = dup(STDERR_FD); + } + + void startCapture() { + // Clear any previous content + captured_stream_.str(""); + captured_stream_.clear(); + + // Redirect std::cerr to our stringstream + std::cerr.rdbuf(captured_stream_.rdbuf()); + + // Redirect stderr file descriptor to temp file (for fprintf to stderr) + int temp_fd = open(temp_file_.c_str(), OPEN_FLAGS, OPEN_MODE); + if (temp_fd != -1) { + dup2(temp_fd, STDERR_FD); + fd_close(temp_fd); + } + } + + std::string stopCapture() { + // Restore original cerr stream + std::cerr.rdbuf(cerr_backup_); + + // Restore original stderr file descriptor + dup2(stderr_backup_, STDERR_FD); + + // Read from temp file (captures fprintf(stderr) output from HIP logging) + std::ifstream temp_file(temp_file_); + std::string file_content; + if (temp_file.is_open()) { + std::string line; + while (std::getline(temp_file, line)) { + file_content += line + "\n"; + } + temp_file.close(); + } + + // Combine both captures: C++ streams and file descriptor output + std::string stream_content = captured_stream_.str(); + std::string total_output = stream_content + file_content; + + // Clean up temp file + unlink(temp_file_.c_str()); + + return total_output; + } + + ~OutCapture() { + // Ensure everything is restored + std::cerr.rdbuf(cerr_backup_); + dup2(stderr_backup_, STDERR_FD); + fd_close(stderr_backup_); + unlink(temp_file_.c_str()); + } +}; diff --git a/projects/hip-tests/catch/unit/errorHandling/hipDynamicLogging.cc b/projects/hip-tests/catch/unit/errorHandling/hipDynamicLogging.cc new file mode 100644 index 0000000000..d242b12bb1 --- /dev/null +++ b/projects/hip-tests/catch/unit/errorHandling/hipDynamicLogging.cc @@ -0,0 +1,156 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include "OutCapture.hh" + +/** + * @addtogroup hipDynamicLogging hipDynamicLogging + * @{ + * @ingroup ErrorTest + * `hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask)` - + * Sets logging parameters for HIP runtime. + * `hipExtEnableLogging()` - + * Enables HIP runtime logging. + * `hipExtDisableLogging()` - + * Disables HIP runtime logging. + */ + +static bool hipDynamicLoggingTest() { + // Create output capture instance + OutCapture capture; + capture.startCapture(); + + // Set Logging params + HIP_CHECK(hipExtSetLoggingParams(4, 0, -1)); + + // Logging is disabled here - allocate memory + int* dptr = nullptr; + HIP_CHECK(hipMalloc(&dptr, sizeof(int))); + + // Stop capture after hipMalloc and check no output (logging disabled) + std::string malloc_output = capture.stopCapture(); + if (malloc_output.size() != 0) { + INFO("Unexpected logging output during hipMalloc (logging should be disabled): " << malloc_output); + return false; + } + + // Start capture before enabling logging + capture.startCapture(); + + // Enable logging and do memset + HIP_CHECK(hipExtEnableLogging()); + HIP_CHECK(hipMemset(dptr, 0x00, sizeof(int))); + + // Disable logging + HIP_CHECK(hipExtDisableLogging()); + + // Stop capture after disabling logging and check for output + std::string logging_output = capture.stopCapture(); + if (logging_output.size() == 0) { + INFO("Expected logging output during enabled logging period, but got none"); + return false; + } + + // Clean up + HIP_CHECK(hipFree(dptr)); + + INFO("Successfully captured HIP logging output (" << logging_output.size() << " bytes)"); + INFO("Logging output: " << logging_output); + + return true; +} + +/** + * Test Description + * ------------------------ + * - Validates that HIP dynamic logging works correctly: + * 1. No output when logging is disabled + * 2. Logging output is captured when logging is enabled + * 3. hipMemset operation produces logging output during enabled period + * Test source + * ------------------------ + * - unit/errorHandling/hipDynamicLogging.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ +TEST_CASE("Unit_hipDynamicLogging_Positive_Basic") { + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + if (numDevices <= 0) { + HipTest::HIP_SKIP_TEST("Skipping hipDynamicLogging test - no devices available"); + return; + } + + REQUIRE(hipDynamicLoggingTest() == true); +} + +/** + * Test Description + * ------------------------ + * - Validates that hipExtSetLoggingParams sets logging parameters correctly + * and that logging can be enabled/disabled multiple times + * Test source + * ------------------------ + * - unit/errorHandling/hipDynamicLogging.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ +TEST_CASE("Unit_hipDynamicLogging_Positive_MultipleEnableDisable") { + int numDevices = 0; + HIP_CHECK(hipGetDeviceCount(&numDevices)); + + if (numDevices <= 0) { + HipTest::HIP_SKIP_TEST("Skipping hipDynamicLogging test - no devices available"); + return; + } + + // Test multiple enable/disable cycles + OutCapture capture; + int* dptr = nullptr; + HIP_CHECK(hipMalloc(&dptr, sizeof(int))); + + // Set different logging parameters + HIP_CHECK(hipExtSetLoggingParams(3, 0, -1)); + + for (int i = 0; i < 3; ++i) { + // Start capture and enable logging + capture.startCapture(); + HIP_CHECK(hipExtEnableLogging()); + HIP_CHECK(hipMemset(dptr, 0x42, sizeof(int))); + HIP_CHECK(hipExtDisableLogging()); + + // Check that we captured some output + std::string output = capture.stopCapture(); + REQUIRE(output.size() > 0); + } + + HIP_CHECK(hipFree(dptr)); +} + +/** + * End doxygen group ErrorTest. + * @} + */ diff --git a/projects/hip-tests/catch/unit/math/half_precision_math.cc b/projects/hip-tests/catch/unit/math/half_precision_math.cc index 4cfa766ffd..f599b59133 100644 --- a/projects/hip-tests/catch/unit/math/half_precision_math.cc +++ b/projects/hip-tests/catch/unit/math/half_precision_math.cc @@ -45,7 +45,7 @@ MATH_UNARY_HP_KERNEL_DEF(hcos); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(hcos, static_cast(std::cos), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(h2cos); @@ -63,7 +63,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2cos); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(h2cos, static_cast(std::cos), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(hsin); @@ -82,7 +82,7 @@ MATH_UNARY_HP_KERNEL_DEF(hsin); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(hsin, static_cast(std::sin), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(h2sin); @@ -100,7 +100,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2sin); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(h2sin, static_cast(std::sin), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(hexp); @@ -119,7 +119,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(hexp, static_cast(std::exp), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(h2exp); @@ -137,7 +137,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(h2exp, static_cast(std::exp), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(hexp10); @@ -156,7 +156,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp10); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(hexp10, static_cast(exp10f), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(h2exp10); @@ -174,7 +174,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp10); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(h2exp10, static_cast(exp10f), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(hexp2); @@ -193,7 +193,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp2); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(hexp2, static_cast(std::exp2), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(h2exp2); @@ -211,7 +211,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp2); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(h2exp2, static_cast(std::exp2), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(hlog); @@ -230,7 +230,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(hlog, static_cast(std::log), - ULPValidatorBuilderFactory(1)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(h2log); @@ -248,7 +248,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(h2log, static_cast(std::log), - ULPValidatorBuilderFactory(1)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(hlog10); @@ -267,7 +267,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog10); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(hlog10, static_cast(std::log10), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(h2log10); @@ -285,7 +285,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log10); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(h2log10, static_cast(std::log10), - ULPValidatorBuilderFactory(2)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(hlog2); @@ -304,7 +304,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog2); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(hlog2, static_cast(std::log2), - ULPValidatorBuilderFactory(1)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(h2log2); @@ -322,7 +322,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log2); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(h2log2, static_cast(std::log2), - ULPValidatorBuilderFactory(1)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(hsqrt); @@ -341,7 +341,7 @@ MATH_UNARY_HP_KERNEL_DEF(hsqrt); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(hsqrt, static_cast(std::sqrt), - ULPValidatorBuilderFactory(1)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(h2sqrt); @@ -359,7 +359,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2sqrt); * - HIP_VERSION >= 5.2 */ MATH_UNARY_HP_TEST_DEF_IMPL(h2sqrt, static_cast(std::sqrt), - ULPValidatorBuilderFactory(1)); + ULPValidatorBuilderFactory(1)); MATH_UNARY_HP_KERNEL_DEF(hceil); diff --git a/projects/hip-tests/catch/unit/math/math_common.hh b/projects/hip-tests/catch/unit/math/math_common.hh index 08bd720816..a92685210f 100644 --- a/projects/hip-tests/catch/unit/math/math_common.hh +++ b/projects/hip-tests/catch/unit/math/math_common.hh @@ -187,7 +187,9 @@ template class MathTest { std::stringstream ss; ss << "Input value(s): " << std::scientific << std::setprecision(std::numeric_limits::max_digits10 - 1); - ((ss << " " << args), ...) << "\n" << actual_val << " "; + ((ss << " " << args), ...) << "\n" + << "Output value: " << actual_val << "\n" + << "Condition failed: "; return ss.str(); } diff --git a/projects/hip-tests/catch/unit/math/validators.hh b/projects/hip-tests/catch/unit/math/validators.hh index 67c49132d4..d62ec5381b 100644 --- a/projects/hip-tests/catch/unit/math/validators.hh +++ b/projects/hip-tests/catch/unit/math/validators.hh @@ -25,6 +25,12 @@ THE SOFTWARE. #include #include +#include +#include +#include + +#include "Float16.hh" + // Define a new MatcherBase class with a public 'describe' member function because // Catch::MatcherBase::describe is protected and thus can't be used via a pointer to // Catch::MatcherBase. @@ -61,6 +67,113 @@ template class ValidatorBase : public MatcherBase bool nan = false; }; +struct Float16WithinUlpsMatcher : MatcherBase { + Float16WithinUlpsMatcher(Float16 target, uint64_t ulps) : m_target(target), m_ulps(ulps) {} + + bool match(Float16 const& matchee) const override { + // Comparison with NaN should always be false. + // This way we can rule it out before getting into the ugly details + if (__hisnan(matchee) || __hisnan(m_target)) { + return false; + } + + auto value_bits = convertFloat16toInt16(matchee); + auto target_bits = convertFloat16toInt16(m_target); + + // If signs differ, handle the special +0 vs -0 case explicitly. + if ((value_bits < 0) != (target_bits < 0)) { + return matchee == m_target; + } + + auto ulp_diff = std::abs(value_bits - target_bits); + return static_cast(ulp_diff) <= m_ulps; + } + + std::string describe() const override { + std::stringstream ret; + + ret << "is within " << m_ulps << " ULPs of "; + + write(ret, m_target); + ret << 'f'; + ret << " (["; + + write(ret, step(m_target, -FLOAT16_MAX, m_ulps)); + ret << ", "; + write(ret, step(m_target, FLOAT16_MAX, m_ulps)); + + ret << "])"; + + return ret.str(); + } + + private: + Float16 getNextAfter(Float16 from, Float16 direction) const { + constexpr int16_t signbit_float16 = 0x8000; + + // Encode inputs as 16-bit integers + const int16_t from_bits = convertFloat16toInt16(from); + const int16_t direction_bits = convertFloat16toInt16(direction); + + // Special cases + if (from_bits == direction_bits) return direction_bits; + if (std::abs(from_bits) == static_cast(0) && + std::abs(direction_bits) == static_cast(0)) + return direction; + + // Makes integer comparisons reflect numeric ordering across sign. + const int16_t from_ordered = (from_bits < 0) ? signbit_float16 - from_bits : from_bits; + const int16_t direction_ordered = + (direction_bits < 0) ? signbit_float16 - direction_bits : direction_bits; + + // Decide whether to move up or down by one ULP + const int16_t step = (from_ordered < direction_ordered) ? 1 : -1; + + // Take one step + const int16_t after_step_ordered = from_ordered + step; + + // Map back from ordered space to raw Float16 bits. + int16_t next_bits = + (after_step_ordered < 0) ? signbit_float16 - after_step_ordered : after_step_ordered; + + // Handle boundary behavior for the most-negative edge case. + if (from_ordered == -1 && (from_ordered < direction_ordered)) { + next_bits = signbit_float16; + } + + return convertInt16toFloat16(next_bits); + } + + Float16 step(Float16 start, Float16 direction, uint64_t steps) const { + Float16 result = start; + for (uint64_t i = 0; i < steps; ++i) { + result = getNextAfter(result, direction); + } + return result; + } + + void write(std::ostream& out, Float16 num) const { + const uint32_t float16_max_digits = 5; + out << std::scientific << std::setprecision(float16_max_digits) << num; + } + + static Float16 convertInt16toFloat16(int16_t d) { + Float16 i; + std::memcpy(&i, &d, sizeof(int16_t)); + return i; + } + + static int16_t convertFloat16toInt16(Float16 d) { + uint16_t i; + std::memcpy(&i, &d, sizeof(Float16)); + return i; + } + + Float16 m_target; + uint64_t m_ulps; +}; + + template auto ULPValidatorBuilderFactory(int64_t ulps) { return [=](T target, auto&&...) { return std::make_unique>( @@ -68,6 +181,13 @@ template auto ULPValidatorBuilderFactory(int64_t ulps) { }; }; +template <> inline auto ULPValidatorBuilderFactory(int64_t ulps) { + return [=](Float16 target, auto&&...) { + return std::make_unique>( + target, Float16WithinUlpsMatcher(target, ulps)); + }; +}; + template auto AbsValidatorBuilderFactory(double margin) { return [=](T target, auto&&...) { return std::make_unique>( @@ -96,7 +216,7 @@ template class EqValidator : public MatcherBase { std::string describe() const override { std::stringstream ss; - ss << " is not equal to " << target_; + ss << "is equal to " << target_; return ss.str(); } diff --git a/projects/hip-tests/catch/unit/memory/hipMemPoolTrimTo.cc b/projects/hip-tests/catch/unit/memory/hipMemPoolTrimTo.cc index dde9bd4aaa..9315b98565 100644 --- a/projects/hip-tests/catch/unit/memory/hipMemPoolTrimTo.cc +++ b/projects/hip-tests/catch/unit/memory/hipMemPoolTrimTo.cc @@ -177,8 +177,8 @@ static bool checkhipMemPoolTrimTo(hipStream_t stream, int N, int dev = 0) { testObj.transferFromMempool(stream); testObj.freeDevBuf(stream); // verify and validate - REQUIRE(true == testObj.validateResult()); HIP_CHECK(hipStreamSynchronize(stream)); + REQUIRE(true == testObj.validateResult()); } HIP_CHECK(hipMemPoolDestroy(mem_pool)); return true; diff --git a/projects/hip/docs/how-to/hip_runtime_api/cooperative_groups.rst b/projects/hip/docs/how-to/hip_runtime_api/cooperative_groups.rst index a3e32cd294..bf34ab7583 100644 --- a/projects/hip/docs/how-to/hip_runtime_api/cooperative_groups.rst +++ b/projects/hip/docs/how-to/hip_runtime_api/cooperative_groups.rst @@ -494,7 +494,6 @@ HIP doesn't support the following CUDA functions/operators in ``cooperative_grou * ``synchronize`` * ``memcpy_async`` * ``wait`` and ``wait_prior`` -* ``barrier_arrive`` and ``barrier_wait`` * ``invoke_one`` and ``invoke_one_broadcast`` * ``reduce`` * ``reduce_update_async`` and ``reduce_store_async`` diff --git a/projects/hip/include/hip/hip_runtime_api.h b/projects/hip/include/hip/hip_runtime_api.h index a7bc73deb1..6f6aee86fe 100644 --- a/projects/hip/include/hip/hip_runtime_api.h +++ b/projects/hip/include/hip/hip_runtime_api.h @@ -9615,6 +9615,45 @@ hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject); /** * @} */ + +/** + * @brief Enable HIP runtime logging. + * + * This function enables the HIP runtime logging mechanism, allowing diagnostic + * and trace information to be captured during HIP API execution. + * + * @returns #hipSuccess + * + * @see hipExtDisableLogging, hipExtSetLoggingParams + */ +hipError_t hipExtEnableLogging(); +/** + * @brief Disable HIP runtime logging. + * + * This function disables the HIP runtime logging mechanism, stopping the capture + * of diagnostic and trace information during HIP API execution. + * + * @returns #hipSuccess + * + * @see hipExtEnableLogging, hipExtSetLoggingParams + */ +hipError_t hipExtDisableLogging(); +/** + * @brief Set HIP runtime logging parameters. + * + * This function configures the logging behavior of the HIP runtime, including + * the verbosity level, buffer size, and which components to log. + * + * @param [in] log_level The logging verbosity level. Higher values produce more detailed output. + * @param [in] log_size Reserved for future use. Currently not implemented. + * @param [in] log_mask A bitmask specifying which HIP runtime components to log. + * + * @returns #hipSuccess, #hipErrorInvalidValue + * + * @see hipExtEnableLogging, hipExtDisableLogging + */ +hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask); + #ifdef __cplusplus } /* extern "c" */ #endif diff --git a/projects/rocprofiler-compute/.pre-commit-config.yaml b/projects/rocprofiler-compute/.pre-commit-config.yaml index 2b36616c96..29831ddc66 100644 --- a/projects/rocprofiler-compute/.pre-commit-config.yaml +++ b/projects/rocprofiler-compute/.pre-commit-config.yaml @@ -2,7 +2,7 @@ default_stages: [pre-commit] fail_fast: true repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: check-yaml - id: end-of-file-fixer @@ -12,7 +12,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. Check https://github.com/astral-sh/ruff-pre-commit#version-compatibility # for the latest ruff version supported by the hook. - rev: v0.12.12 + rev: v0.14.11 hooks: - id: ruff-check args: [--fix] diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 5b4c63c36a..ed1625615c 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -49,6 +49,10 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Fix issue where counter collection data was empty when profiling workload which spawn multiple child processes +* Fix issue where dispatch filtering in a range (e.g. >2) was not working + +* Fix redundant warnings for compute/memory partition not found for < MI 300 series GPUs by skipping partition checks + ### Removed * Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter. diff --git a/projects/rocprofiler-compute/README.md b/projects/rocprofiler-compute/README.md index 89e3f105b9..f913e5ad47 100644 --- a/projects/rocprofiler-compute/README.md +++ b/projects/rocprofiler-compute/README.md @@ -4,14 +4,13 @@ ROCm Compute Profiler is a system performance profiling tool for machine learning/HPC workloads running on AMD MI GPUs. The tool presently -targets usage on MI100, MI200, and MI300 accelerators. +targets usage on MI100, MI200, MI300, and MI350 series accelerators. * For more information on available features, installation steps, and workload profiling and analysis, please refer to the online [documentation](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/). -* ROCm Compute Profiler is an AMD open source research project and is not supported -as part of the ROCm software stack. We welcome contributions and +* ROCm Compute Profiler is an AMD open source tool that is part of the ROCm software stack. We welcome contributions and feedback from the community. Please see the [CONTRIBUTING.md](CONTRIBUTING.md) file for additional details on our contribution process. @@ -39,8 +38,8 @@ python3 -m pip install -r requirements.txt ## Testing -Populate the variable in `docker/docker-compose.customrocmtest.yml`. -Populate the variable in `docker/Dockerfile.customrocmtest` based on latest ROCm CI build information. +Populate the variable in `docker/docker-compose.customrocmtest.yml`. +Populate the variable in `docker/Dockerfile.customrocmtest` based on latest TheRock nightly build information. To quickly get the environment (bash shell) for building and testing, run the following commands: * `cd docker` @@ -115,7 +114,7 @@ This software can be cited using a Zenodo style reference is provided below for convenience: ``` -@software{xiaomin_lu_2022_7314631 +@misc{xiaomin_lu_2022_7314631 author = {Xiaomin Lu and Cole Ramos and Fei Zheng and @@ -124,12 +123,7 @@ style reference is provided below for convenience: Keith Lowery and Nicholas Curtis and Cristian Di Pietrantonio}, - title = {ROCm/rocprofiler-compute: v3.1.0 (12 February 2025)}, - month = February, - year = 2025, - publisher = {Zenodo}, - version = {v3.1.0}, - doi = {10.5281/zenodo.7314631}, - url = {https://doi.org/10.5281/zenodo.7314631} + title = {rocprofiler-compute}, + url = {https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-compute} } ``` diff --git a/projects/rocprofiler-compute/docker/Dockerfile.customrocmtest b/projects/rocprofiler-compute/docker/Dockerfile.customrocmtest index 47ec351559..020f8b0e4c 100644 --- a/projects/rocprofiler-compute/docker/Dockerfile.customrocmtest +++ b/projects/rocprofiler-compute/docker/Dockerfile.customrocmtest @@ -1,32 +1,52 @@ # Use a base image -FROM +FROM ubuntu:22.04 -# Set the working directory -WORKDIR /app +# Install curl first (needed for ROCm download) +RUN apt-get update && apt-get install -y curl + +# Define the tarball name as a variable +# Check https://therock-nightly-tarball.s3.amazonaws.com/index.html for latest builds +# Use therock-dist-linux-gfx-dcgpu-.tar.gz naming convention +ARG TARBALL_NAME= + +# Install ROCm from TheRock Nightly build +RUN mkdir -p /rocm && \ + curl -fLO https://therock-nightly-tarball.s3.amazonaws.com/${TARBALL_NAME} && \ + tar -xf ${TARBALL_NAME} -C /rocm && \ + rm ${TARBALL_NAME} + +# Set environment variables for ROCm +ENV PATH="/rocm/bin:${PATH}" \ + ROCM_PATH="/rocm" \ + LD_LIBRARY_PATH="/rocm/lib:/rocm/lib/rocm_sysdeps/lib:${LD_LIBRARY_PATH}" \ + HIP_DEVICE_LIB_PATH="/rocm/llvm/amdgcn/bitcode" \ + HIP_PLATFORM=amd # Update package list and install prerequisites RUN apt-get update && apt-get install -y \ - software-properties-common cmake locales git curl \ + software-properties-common cmake locales git \ && add-apt-repository ppa:deadsnakes/ppa \ && apt-get update -# Allows running git commands in /app -RUN git config --global --add safe.directory /app - # Generate the desired locale RUN locale-gen en_US.UTF-8 # Install Python 3.10 and pip RUN apt-get install -y python3.10 python3.10-venv python3.10-dev python3-pip libsqlite3-dev -RUN python3.10 -m venv venv -ENV PATH="venv/bin:$PATH" +RUN python3.10 -m venv /venv +ENV PATH="/venv/bin:$PATH" RUN python -m pip install --upgrade pip -# Install any dependencies specified in requirements.txt -WORKDIR /app/projects/rocprofiler-compute +# Install any rocprofiler-compute dependencies specified in requirements.txt COPY projects/rocprofiler-compute/requirements.txt /app/projects/rocprofiler-compute/requirements.txt COPY projects/rocprofiler-compute/requirements-test.txt /app/projects/rocprofiler-compute/requirements-test.txt -RUN python -m pip install -r requirements.txt -r requirements-test.txt +RUN python -m pip install -r /app/projects/rocprofiler-compute/requirements.txt -r /app/projects/rocprofiler-compute/requirements-test.txt + +# Set the working directory +WORKDIR /app + +# Allows running git commands in /app +RUN git config --global --add safe.directory /app # Run interactive bash shell CMD ["/bin/bash"] diff --git a/projects/rocprofiler-compute/docs/data/analyze/analysis_data_dump_schema.png b/projects/rocprofiler-compute/docs/data/analyze/analysis_data_dump_schema.png index 274b6f0020..24004d3825 100644 Binary files a/projects/rocprofiler-compute/docs/data/analyze/analysis_data_dump_schema.png and b/projects/rocprofiler-compute/docs/data/analyze/analysis_data_dump_schema.png differ diff --git a/projects/rocprofiler-compute/docs/data/analyze/analysis_data_dump_views.png b/projects/rocprofiler-compute/docs/data/analyze/analysis_data_dump_views.png index 954ee506dd..7942dd96a7 100644 Binary files a/projects/rocprofiler-compute/docs/data/analyze/analysis_data_dump_views.png and b/projects/rocprofiler-compute/docs/data/analyze/analysis_data_dump_views.png differ diff --git a/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst index 984d226aaa..7385f21834 100644 --- a/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst +++ b/projects/rocprofiler-compute/docs/how-to/analyze/cli.rst @@ -15,7 +15,7 @@ This section provides an overview of ROCm Compute Profiler's CLI analysis featur * :ref:`Metric customization `: Isolate a subset of built-in metrics or build your own profiling configuration. * :ref:`Filtering `: Hone in on a particular kernel, GPU ID, or dispatch ID via post-process filtering. - + * :ref:`Per-kernel roofline analysis `: Detailed arithmetic intensity and performance analysis for individual kernels. Run ``rocprof-compute analyze -h`` for more details. @@ -346,6 +346,7 @@ Show System Speed-of-Light and CS_Busy blocks only this case, ``1`` is the ID for System Speed-of-Light and ``5.1.0`` the ID for GPU Busy Cycles metric. + Filter kernels First, list the top kernels in your application using `--list-stats`. @@ -534,36 +535,40 @@ Analysis database example .. code-block:: shell-session - $ rocprof-compute analyze --verbose --db test -p workloads/vmem/MI300X_A1 -p workloads/vmem1/MI300X_A1 + $ rocprof-compute analyze --verbose --output-name test --output-format db -p workloads/nbody/MI300X_A1 -p workloads/nbody1/MI300X_A1 DEBUG Execution mode = analyze - __ _ - _ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___ - | '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \ - | | | (_) | (__| |_) | | | (_) | _|_____| (_| (_) | | | | | | |_) | |_| | || __/ - |_| \___/ \___| .__/|_| \___/|_| \___\___/|_| |_| |_| .__/ \__,_|\__\___| - |_| |_| + __ _ + _ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___ +| '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \ +| | | (_) | (__| |_) | | | (_) | _|_____| (_| (_) | | | | | | |_) | |_| | || __/ +|_| \___/ \___| .__/|_| \___/|_| \___\___/|_| |_| |_| .__/ \__,_|\__\___| + |_| |_| - INFO Analysis mode = db - DEBUG [omnisoc init] - DEBUG [omnisoc init] - DEBUG [analysis] prepping to do some analysis - INFO [analysis] deriving rocprofiler-compute metrics... - WARNING Roofline ceilings not found for /app/projects/rocprofiler-compute/workloads/vmem/MI300X_A1. - WARNING Roofline ceilings not found for /app/projects/rocprofiler-compute/workloads/vmem1/MI300X_A1. - WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/vmem/MI300X_A1. - WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/vmem1/MI300X_A1. - DEBUG Collected dispatch data - DEBUG Applied analysis mode filters - DEBUG Calculated dispatch data - DEBUG Collected metrics data - WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg( + INFO Analysis mode = db + INFO ed45b0b189 + DEBUG [omnisoc init] + INFO ed45b0b189 + DEBUG [omnisoc init] + DEBUG [analysis] prepping to do some analysis + INFO [analysis] deriving rocprofiler-compute metrics... + DEBUG Collected roofline ceilings +WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/nbody/MI300X_A1. +WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/nbody1/MI300X_A1. + DEBUG Collected dispatch data + DEBUG Applied analysis mode filters + DEBUG Calculated dispatch data + DEBUG Collected metrics data +WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg( (pmc_df.get("pmc_perf_ACCUM") / pmc_df.get("SQC_ICACHE_REQ")).where((pmc_df.get("SQC_ICACHE_REQ") != 0), None)) * 100), 0) - unsupported operand type(s) for /: 'NoneType' and 'float' - WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg( +WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg( (pmc_df.get("pmc_perf_ACCUM") / pmc_df.get("SQC_ICACHE_REQ")).where((pmc_df.get("SQC_ICACHE_REQ") != 0), None)) * 100), 0) - unsupported operand type(s) for /: 'NoneType' and 'float' - DEBUG Calculated metric values - DEBUG Calculated roofline data points - DEBUG [analysis] generating analysis - DEBUG SQLite database initialized with name: test.db - DEBUG Initialized database: test.db - DEBUG Completed writing database \ No newline at end of file + DEBUG Calculated metric values + DEBUG Calculated roofline data points + DEBUG [analysis] generating analysis + DEBUG SQLite database initialized with name: test.db + DEBUG Initialized database: test.db + INFO ed45b0b189 + INFO ed45b0b189 + DEBUG Completed writing database +WARNING Created file: test.db diff --git a/projects/rocprofiler-compute/docs/how-to/analyze/standalone-gui.rst b/projects/rocprofiler-compute/docs/how-to/analyze/standalone-gui.rst index e7cedb3fa1..41b2176411 100644 --- a/projects/rocprofiler-compute/docs/how-to/analyze/standalone-gui.rst +++ b/projects/rocprofiler-compute/docs/how-to/analyze/standalone-gui.rst @@ -28,7 +28,9 @@ Launch the standalone GUI analyzer ---------------------------------- To launch the ROCm Compute Profiler GUI analyzer, include the ``--gui`` flag with your -desired analysis command. For example: +desired analysis command. + +For example: .. code-block:: shell-session diff --git a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py index 7cfe42403d..b4edad07fd 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py @@ -386,26 +386,24 @@ class OmniAnalyze_Base: sys.exit(0) # Ensure analysis output does not overwrite existing files - if not args.output_name: - return + if args.output_name: + if not re.match(r"^[A-Za-z0-9_-]+$", args.output_name): + console_error( + "analysis", + "Analysis output file/folder name must " + "contain only alphanumeric characters " + "or underscores (_), hyphens (-).", + ) - if not re.match(r"^[A-Za-z0-9_-]+$", args.output_name): - console_error( - "analysis", - "Analysis output file/folder name must " - "contain only alphanumeric characters " - "or underscores (_), hyphens (-).", - ) + path_to_check = args.output_name + if args.output_format in ("txt", "db"): + path_to_check += f".{args.output_format}" - path_to_check = args.output_name - if args.output_format in ("txt", "db"): - path_to_check += f".{args.output_format}" - - if Path(path_to_check).exists(): - console_error( - f"Analysis output file/folder {path_to_check} already exists. " - "Please choose a different name." - ) + if Path(path_to_check).exists(): + console_error( + f"Analysis output file/folder {path_to_check} already exists. " + "Please choose a different name." + ) # Check if any kernel's counters are missing due to iteration multiplexing if ( diff --git a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_db.py b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_db.py index 4a121499f7..1a01b23a55 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_db.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_db.py @@ -101,7 +101,9 @@ class db_analysis(OmniAnalyze_Base): Database.init(db_name) console_debug(f"Initialized database: {db_name}") + # Iterate over all workloads for workload_path in self._runs.keys(): + # Add workload workload_obj = orm.Workload( name=workload_path.split("/")[-2], sub_name=workload_path.split("/")[-1], @@ -113,38 +115,9 @@ class db_analysis(OmniAnalyze_Base): ) Database.get_session().add(workload_obj) - for pc_sample in self._pc_sampling_data_per_workload.get( - workload_path, pd.DataFrame() - ).itertuples(): - Database.get_session().add( - orm.PCsampling( - source=pc_sample.source_line, - instruction=pc_sample.instruction, - count=pc_sample.count, - kernel_name=pc_sample.kernel_name, - offset=pc_sample.offset, - count_issue=pc_sample.count_issued, - count_stall=pc_sample.count_stalled, - stall_reason=pc_sample.stall_reason, - workload=workload_obj, - ) - ) - - for roofline_data in self._roofline_data_per_workload.get( - workload_path, pd.DataFrame() - ).itertuples(): - Database.get_session().add( - orm.RooflineData( - kernel_name=roofline_data.kernel_name, - total_flops=roofline_data.total_flops, - l1_cache_data=roofline_data.l1_cache_data, - l2_cache_data=roofline_data.l2_cache_data, - hbm_cache_data=roofline_data.hbm_cache_data, - workload=workload_obj, - ) - ) - + # Add kernel kernel_objs: dict[str, orm.Kernel] = {} + for dispatch in self._dispatch_data_per_workload.get( workload_path, pd.DataFrame() ).itertuples(): @@ -167,44 +140,101 @@ class db_analysis(OmniAnalyze_Base): ) ) - # Optimize: Pre-group values by (metric_id, kernel_name) for O(1) lookups - values_df = self._values_data_per_workload.get( - workload_path, pd.DataFrame() - ) - values_grouped = {} - if not values_df.empty: - for value in values_df.itertuples(): - key = (value.metric_id, value.kernel_name) - if key not in values_grouped: - values_grouped[key] = [] - values_grouped[key].append(value) - - for metric in self._metrics_info_data_per_workload.get( + # Add roofline data points + for roofline_data in self._roofline_data_per_workload.get( workload_path, pd.DataFrame() ).itertuples(): - for kernel_name in kernel_objs.keys(): - metric_obj = orm.Metric( - name=metric.name, - metric_id=metric.metric_id, - description=metric.description, - unit=metric.unit, - table_name=metric.table_name, - sub_table_name=metric.sub_table_name, - kernel=kernel_objs[kernel_name], + if roofline_data.kernel_name not in kernel_objs: + console_warning( + f"Kernel {roofline_data.kernel_name} from roofline data " + "not found in dispatch data. Skipping roofline entry." ) - Database.get_session().add(metric_obj) + continue + Database.get_session().add( + orm.RooflineData( + total_flops=roofline_data.total_flops, + l1_cache_data=roofline_data.l1_cache_data, + l2_cache_data=roofline_data.l2_cache_data, + hbm_cache_data=roofline_data.hbm_cache_data, + kernel=kernel_objs[roofline_data.kernel_name], + ) + ) - # Direct lookup instead of iterating through all values - key = (metric.metric_id, kernel_name) - for value in values_grouped.get(key, []): - Database.get_session().add( - orm.Value( - metric=metric_obj, - value_name=value.value_name, - value=value.value, - ) + # Add pc sampling data + for pc_sample in self._pc_sampling_data_per_workload.get( + workload_path, pd.DataFrame() + ).itertuples(): + if pc_sample.kernel_name not in kernel_objs: + console_warning( + f"Kernel {pc_sample.kernel_name} from PC sampling data " + "not found in dispatch data. Skipping PC sampling entry." + ) + continue + Database.get_session().add( + orm.PCsampling( + source=pc_sample.source_line, + instruction=pc_sample.instruction, + count=pc_sample.count, + offset=pc_sample.offset, + count_issue=pc_sample.count_issued, + count_stall=pc_sample.count_stalled, + stall_reason=pc_sample.stall_reason, + kernel=kernel_objs[pc_sample.kernel_name], + ) + ) + + # Add metrics and values - iterate on values, create metrics as needed + metrics_info_dict = { + row.metric_id: row + for row in self._metrics_info_data_per_workload.get( + workload_path, pd.DataFrame() + ).itertuples() + } + metric_objs: dict[str, orm.MetricDefinition] = {} + + for value in self._values_data_per_workload.get( + workload_path, pd.DataFrame() + ).itertuples(): + # Check if kernel exists + if value.kernel_name not in kernel_objs: + console_warning( + f"Kernel {value.kernel_name} from values data " + "not found in dispatch data. Skipping metric value." + ) + continue + + # Create or reuse metric object + if value.metric_id not in metric_objs: + # Fetch metric info + if value.metric_id not in metrics_info_dict: + console_warning( + f"Metric {value.metric_id} from values data " + "not found in metrics info. Skipping metric value." ) + continue + metric_info = metrics_info_dict[value.metric_id] + metric_objs[value.metric_id] = orm.MetricDefinition( + name=metric_info.name, + metric_id=metric_info.metric_id, + description=metric_info.description, + unit=metric_info.unit, + table_name=metric_info.table_name, + sub_table_name=metric_info.sub_table_name, + workload=workload_obj, + ) + Database.get_session().add(metric_objs[value.metric_id]) + # Add value + Database.get_session().add( + orm.MetricValue( + metric=metric_objs[value.metric_id], + kernel=kernel_objs[value.kernel_name], + value_name=value.value_name, + value=value.value, + ) + ) + + # Add metadata version = get_version(rocprof_compute_home) Database.get_session().add( orm.Metadata( diff --git a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_webui.py b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_webui.py index 40df31abc4..745b7febd0 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_webui.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_webui.py @@ -51,7 +51,6 @@ class webui_analysis(OmniAnalyze_Base): self.app = dash.Dash( __name__, title=PROJECT_NAME, external_stylesheets=[dbc.themes.CYBORG] ) - self.dest_dir = str(Path(args.path[0][0]).absolute().resolve()) self.arch: Optional[str] = None self.__hidden_sections = ["Memory Chart"] @@ -90,6 +89,7 @@ class webui_analysis(OmniAnalyze_Base): kernel_top_df = base_data.dfs[1] for kernel_id in base_data.filter_kernel_ids: filt_kernel_names.append(str(kernel_top_df.loc[kernel_id, "Kernel_Name"])) + input_filters["kernel"] = filt_kernel_names # setup app layout from utils.gui_components.header import get_header @@ -338,6 +338,7 @@ class webui_analysis(OmniAnalyze_Base): ) args = self.get_args() + self.dest_dir = str(Path(args.path[0][0]).absolute().resolve()) # create 'mega dataframe' self._runs[self.dest_dir].raw_pmc = file_io.create_df_pmc( diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py index 9bae23bf14..a79d188c75 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py @@ -531,12 +531,15 @@ class RocProfCompute_Base: and not args.attach_pid ): # Use native counter collection tool + # Use lib* glob pattern to handle CMAKE_INSTALL_LIBDIR variations + # (lib, lib64, lib32, etc. depending on distribution) + native_tool_base_path = Path(sys.argv[0]).resolve().parents[2] + native_tool_glob_pattern = ( + "lib*/rocprofiler-compute/librocprofiler-compute-tool.so" + ) try: native_tool_path = str( - Path(sys.argv[0]).resolve().parents[2] - / "lib" - / "rocprofiler-compute" - / "librocprofiler-compute-tool.so" + next(native_tool_base_path.glob(native_tool_glob_pattern)) ) except Exception as e: console_debug( @@ -552,6 +555,7 @@ class RocProfCompute_Base: ) / "librocprofiler-compute-tool.so" ) + native_tool_cpp_path = Path(__file__).resolve().parents[1] / "lib" link_libraries = ("rocprofiler-sdk",) build_command = ( # Create shared object @@ -564,10 +568,10 @@ class RocProfCompute_Base: # rocprofiler sdk library path f"-L {str(Path(args.rocprofiler_sdk_tool_path).parent.parent)} " # native tool source files (tool.cpp and helper.cpp) - f"{str(Path(__file__).parent.parent)}/" - "lib/rocprofiler_compute_tool.cpp " - f"{str(Path(__file__).parent.parent)}/" - "lib/helper.cpp " + f"{native_tool_cpp_path}/" + "rocprofiler_compute_tool.cpp " + f"{native_tool_cpp_path}/" + "helper.cpp " # temporary shared object for native tool f"-o {native_tool_path}" ) @@ -575,7 +579,15 @@ class RocProfCompute_Base: success, output = capture_subprocess_output(shlex.split(build_command)) console_debug(f"Build output: {output}") if not success: - console_error("Failed to build native counter collection tool.") + console_error( + "Failed to use native counter collection tool.\n" + "Could not find pre-built .so file at: " + f"{native_tool_base_path / native_tool_glob_pattern}\n" + "Could not find source .cpp files in folder: " + f"{native_tool_cpp_path}\n" + "Please ensure the native tool library is installed " + "or source files are present." + ) if self.__profiler == "rocprofiler-sdk": options = self.get_profiler_options(native_tool_path=native_tool_path) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprofiler_sdk.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprofiler_sdk.py index f976c99d0d..542ec04619 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprofiler_sdk.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprofiler_sdk.py @@ -92,7 +92,6 @@ class rocprofiler_sdk_profiler(RocProfCompute_Base): / "librocprofiler-sdk-rocattach.so" ) options.update({ - "ROCPROF_ATTACH_TOOL_LIBRARY": rocprofiler_sdk_tool_path, "ROCPROF_ATTACH_LIBRARY": rocprofiler_attach_library_path, "ROCPROF_ATTACH_PID": args.attach_pid, }) diff --git a/projects/rocprofiler-compute/src/utils/analysis_orm.py b/projects/rocprofiler-compute/src/utils/analysis_orm.py index f2315fc768..76a9ff3c08 100644 --- a/projects/rocprofiler-compute/src/utils/analysis_orm.py +++ b/projects/rocprofiler-compute/src/utils/analysis_orm.py @@ -45,7 +45,7 @@ from sqlalchemy.sql import Select from utils.logger import console_debug, console_error PREFIX = "compute_" -SCHEMA_VERSION = "1.1.0" +SCHEMA_VERSION = "1.2.0" Base = declarative_base() @@ -63,18 +63,16 @@ class Workload(Base): # Workload can have multiple kernels kernels = relationship("Kernel", back_populates="workload") - # Workload can have multiple roofline data points - roofline_data_points = relationship("RooflineData", back_populates="workload") - # Workload can have multiple pc_sampling values - pc_sampling_values = relationship("PCsampling", back_populates="workload") + # Workload can have multiple metric definitions + metric_definitions = relationship("MetricDefinition", back_populates="workload") -class Metric(Base): - __tablename__ = f"{PREFIX}metric" +class MetricDefinition(Base): + __tablename__ = f"{PREFIX}metric_definition" metric_uuid = Column(Integer, primary_key=True) - kernel_uuid = Column( - Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False + workload_id = Column( + Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False ) name = Column(String) # e.g. Wavefronts Num metric_id = Column(String) # e.g. 4.1.3 @@ -83,27 +81,26 @@ class Metric(Base): sub_table_name = Column(String) # e.g. Wavefront stats unit = Column(String) # e.g. Gbps - # Metric can have one kernel - kernel = relationship("Kernel", back_populates="metrics") - # Metric can have multiple values - values = relationship("Value", back_populates="metric") + # Metric can have one workload + workload = relationship("Workload", back_populates="metric_definitions") + # Metric can have multiple metric values + metric_values = relationship("MetricValue", back_populates="metric") class RooflineData(Base): __tablename__ = f"{PREFIX}roofline_data" roofline_uuid = Column(Integer, primary_key=True) - workload_id = Column( - Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False + kernel_uuid = Column( + Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False ) - kernel_name = Column(String) total_flops = Column(Float) l1_cache_data = Column(Float) l2_cache_data = Column(Float) hbm_cache_data = Column(Float) - # Roofline data point can have one workload - workload = relationship("Workload", back_populates="roofline_data_points") + # Roofline data point can have one kernel + kernel = relationship("Kernel", back_populates="roofline_data_points") class Dispatch(Base): @@ -135,42 +132,50 @@ class Kernel(Base): workload = relationship("Workload", back_populates="kernels") # Kernel can have multiple dispatches dispatches = relationship("Dispatch", back_populates="kernel") - # Kernel can have multiple metrics - metrics = relationship("Metric", back_populates="kernel") + # Kernel can have multiple metric values + metric_values = relationship("MetricValue", back_populates="kernel") + # Kernel can have multiple roofline data points + roofline_data_points = relationship("RooflineData", back_populates="kernel") + # Kernel can have multiple pc_sampling values + pc_sampling_values = relationship("PCsampling", back_populates="kernel") class PCsampling(Base): __tablename__ = f"{PREFIX}pcsampling" pc_sampling_uuid = Column(Integer, primary_key=True) - workload_id = Column( - Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False + kernel_uuid = Column( + Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False ) source = Column(String) instruction = Column(String) count = Column(Integer) - kernel_name = Column(String) offset = Column(Integer) count_issue = Column(Integer) count_stall = Column(Integer) stall_reason = Column(JSON) - # PCsampling can have one workload - workload = relationship("Workload", back_populates="pc_sampling_values") + # PCsampling can have one kernel + kernel = relationship("Kernel", back_populates="pc_sampling_values") -class Value(Base): - __tablename__ = f"{PREFIX}value" +class MetricValue(Base): + __tablename__ = f"{PREFIX}metric_value" value_uuid = Column(Integer, primary_key=True) metric_uuid = Column( - Integer, ForeignKey(f"{PREFIX}metric.metric_uuid"), nullable=False + Integer, ForeignKey(f"{PREFIX}metric_definition.metric_uuid"), nullable=False + ) + kernel_uuid = Column( + Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False ) value_name = Column(String) # e.g. min, max, avg value = Column(Float) # e.g. 123.45 # Value can have one metric - metric = relationship("Metric", back_populates="values") + metric = relationship("MetricDefinition", back_populates="metric_values") + # Value can have one kernel + kernel = relationship("Kernel", back_populates="metric_values") class Metadata(Base): @@ -250,11 +255,20 @@ def get_views() -> list[TextClause]: views: dict[str, Select[Any]] = { "kernel_view": select( + Kernel.kernel_uuid.label("kernel_uuid"), + Kernel.workload_id.label("workload_id"), + Workload.name.label("workload_name"), Kernel.kernel_name, func.count(Dispatch.dispatch_id).label("dispatch_count"), func.sum(Dispatch.end_timestamp - Dispatch.start_timestamp).label( "duration_ns_sum" ), + func.min(Dispatch.end_timestamp - Dispatch.start_timestamp).label( + "duration_ns_min" + ), + func.max(Dispatch.end_timestamp - Dispatch.start_timestamp).label( + "duration_ns_max" + ), median_calc.c.duration_ns_median, func.avg(Dispatch.end_timestamp - Dispatch.start_timestamp).label( "duration_ns_mean" @@ -262,24 +276,31 @@ def get_views() -> list[TextClause]: ) .select_from(Dispatch) .join(Kernel, Dispatch.kernel_uuid == Kernel.kernel_uuid) + .join(Workload, Kernel.workload_id == Workload.workload_id) .join(median_calc.subquery(), Kernel.kernel_name == median_calc.c.kernel_name) - .group_by(Kernel.kernel_name), + .group_by( + Kernel.kernel_uuid, Kernel.workload_id, Workload.name, Kernel.kernel_name + ), "metric_view": select( + Workload.workload_id.label("workload_id"), Workload.name.label("workload_name"), + Kernel.kernel_uuid.label("kernel_uuid"), Kernel.kernel_name, - Metric.name.label("metric_name"), - Metric.metric_id, - Metric.description, - Metric.table_name, - Metric.sub_table_name, - Metric.unit, - Value.value_name, - Value.value, + MetricDefinition.metric_uuid.label("metric_uuid"), + MetricDefinition.name.label("metric_name"), + MetricDefinition.metric_id, + MetricDefinition.description, + MetricDefinition.table_name, + MetricDefinition.sub_table_name, + MetricDefinition.unit, + MetricValue.value_uuid.label("value_uuid"), + MetricValue.value_name, + MetricValue.value, ) - .select_from(Metric) - .join(Kernel, Metric.kernel_uuid == Kernel.kernel_uuid) - .join(Value, Metric.metric_uuid == Value.metric_uuid) - .join(Workload, Kernel.workload_id == Workload.workload_id), + .select_from(MetricDefinition) + .join(Workload, MetricDefinition.workload_id == Workload.workload_id) + .join(MetricValue, MetricDefinition.metric_uuid == MetricValue.metric_uuid) + .join(Kernel, MetricValue.kernel_uuid == Kernel.kernel_uuid), } return [ diff --git a/projects/rocprofiler-compute/src/utils/gui.py b/projects/rocprofiler-compute/src/utils/gui.py index a03ab297b4..2708365515 100644 --- a/projects/rocprofiler-compute/src/utils/gui.py +++ b/projects/rocprofiler-compute/src/utils/gui.py @@ -61,7 +61,7 @@ def multi_bar_chart( def create_instruction_mix_bar_chart(display_df: pd.DataFrame, df_unit: str) -> px.bar: display_df = display_df.copy() - display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "" else 0) + display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "N/A" else 0) return px.bar( display_df, @@ -78,7 +78,7 @@ def create_multi_bar_charts( display_df: pd.DataFrame, table_id: int, df_unit: str ) -> list[px.bar]: display_df = display_df.copy() - display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "" else 0) + display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "N/A" else 0) nested_bar = multi_bar_chart(table_id, display_df) charts = [] @@ -103,7 +103,9 @@ def create_multi_bar_charts( def create_sol_charts(display_df: pd.DataFrame, table_id: int) -> list[px.bar]: display_df = display_df.copy() - display_df["Avg"] = display_df["Avg"].apply(lambda x: float(x) if x != "" else 0.0) + display_df["Avg"] = display_df["Avg"].apply( + lambda x: float(x) if x != "N/A" else 0.0 + ) charts = [] @@ -144,7 +146,7 @@ def create_sol_charts(display_df: pd.DataFrame, table_id: int) -> list[px.bar]: elif table_id == 1101: # Special formatting reference 'Pct of Peak' value display_df["Pct of Peak"] = display_df["Pct of Peak"].apply( - lambda x: float(x) if x != "" else 0.0 + lambda x: float(x) if x != "N/A" else 0.0 ) charts.append( px.bar( diff --git a/projects/rocprofiler-compute/src/utils/parser.py b/projects/rocprofiler-compute/src/utils/parser.py index 4ae6c9759a..68b56396c5 100755 --- a/projects/rocprofiler-compute/src/utils/parser.py +++ b/projects/rocprofiler-compute/src/utils/parser.py @@ -1290,6 +1290,8 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat # NB: support ignoring the 1st n dispatched execution by '> n' # The better way may be parsing python slice string for dispatch_id in workload.filter_dispatch_ids: + if isinstance(dispatch_id, str) and ">" in dispatch_id: + dispatch_id = re.match(r"\>\s*(\d+)", dispatch_id).group(1) if int(dispatch_id) >= len(df): # subtract 2 bc of the two header rows console_error("analysis", f"{dispatch_id} is an invalid dispatch id.") @@ -1297,7 +1299,7 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat isinstance(workload.filter_dispatch_ids[0], str) and ">" in workload.filter_dispatch_ids[0] ): - dispatch_match = re.match(r"\> (\d+)", workload.filter_dispatch_ids[0]) + dispatch_match = re.match(r"\>\s*(\d+)", workload.filter_dispatch_ids[0]) df = df[ df[schema.PMC_PERF_FILE_PREFIX]["Dispatch_ID"] > int(dispatch_match.group(1)) diff --git a/projects/rocprofiler-compute/src/utils/specs.py b/projects/rocprofiler-compute/src/utils/specs.py index b9202deb73..e55335686f 100644 --- a/projects/rocprofiler-compute/src/utils/specs.py +++ b/projects/rocprofiler-compute/src/utils/specs.py @@ -174,15 +174,15 @@ def generate_machine_specs( ########################################## machine_info = extract_machine_info() - # FIXME: use device - # Load amd-smi data - gpu_info = extract_gpu_info() - ########################################## ## B. SoC Specs ########################################## soc_info = extract_soc_info() + # FIXME: use device + # Load amd-smi data + gpu_info = extract_gpu_info(gpu_arch=soc_info["gpu_arch"]) + # Combine all specifications with amdsmi_ctx(): specs = MachineSpecs( @@ -269,7 +269,16 @@ def extract_machine_info() -> dict[str, Any]: @demarcate -def extract_gpu_info() -> dict[str, Any]: +def extract_gpu_info(gpu_arch: Optional[str]) -> dict[str, Any]: + # Partition is only supported on >= MI 300 series + # (gpu_arch should be gfx940 or higher for MI300+) + is_partition_supported = False + if gpu_arch and gpu_arch.startswith("gfx") and len(gpu_arch) >= 6: + try: + is_partition_supported = int(gpu_arch[3:6], 16) >= 0x940 + except ValueError: + pass # Invalid hex string, keep is_partition_supported as False + result: dict[str, Optional[str]] = { "vbios": None, "compute_partition": None, @@ -278,17 +287,22 @@ def extract_gpu_info() -> dict[str, Any]: with amdsmi_ctx(): result["vbios"] = get_gpu_vbios_part_number() - result["compute_partition"] = get_gpu_compute_partition() - result["memory_partition"] = get_gpu_memory_partition() + if is_partition_supported: + result["compute_partition"] = get_gpu_compute_partition() + result["memory_partition"] = get_gpu_memory_partition() + else: + result["compute_partition"] = "N/A" + result["memory_partition"] = "N/A" # Apply defaults and warnings - if result["compute_partition"] == "N/A" or not result["compute_partition"]: - console_warning("Cannot detect accelerator partition from amd-smi.") - console_warning("Applying default accelerator partition: SPX") - result["compute_partition"] = "SPX" + if is_partition_supported: + if result["compute_partition"] == "N/A" or not result["compute_partition"]: + console_warning("Cannot detect accelerator partition from amd-smi.") + console_warning("Applying default accelerator partition: SPX") + result["compute_partition"] = "SPX" - if result["memory_partition"] == "N/A" or not result["memory_partition"]: - console_warning("Cannot detect memory partition from amd-smi.") + if result["memory_partition"] == "N/A" or not result["memory_partition"]: + console_warning("Cannot detect memory partition from amd-smi.") console_debug( f"vbios is {result['vbios']}, compute partition is " diff --git a/projects/rocprofiler-compute/src/utils/utils.py b/projects/rocprofiler-compute/src/utils/utils.py index c85471edae..6d51d972ff 100644 --- a/projects/rocprofiler-compute/src/utils/utils.py +++ b/projects/rocprofiler-compute/src/utils/utils.py @@ -235,6 +235,101 @@ def detect_rocprof(args: argparse.Namespace) -> str: return rocprof_cmd +def perform_attach_detach(new_env: dict[str, str], options: dict[str, Any]) -> None: + @contextmanager + def temporary_env(env_vars: dict[str, str]) -> Generator[None, None, None]: + """ + Temporarily change the environment variable of this application. + """ + original_env = os.environ.copy() + os.environ.update({k: str(v) for k, v in env_vars.items()}) + try: + yield + finally: + os.environ.clear() + os.environ.update(original_env) + + with temporary_env(new_env): + libname = options["ROCPROF_ATTACH_LIBRARY"] + + try: + c_lib = ctypes.CDLL(libname) + if c_lib is None: + console_error(f"Error opening {libname}") + except Exception as e: + console_error(f"Error loading {libname}: {e}") + + # Set argument and return types for attach/detach functions + try: + # old attach/detach API + c_lib.attach.argtypes = [ctypes.c_uint] + except Exception as e: + console_debug( + "Error setting old attach/detach API argument " + f"types: {e}, trying new API" + ) + try: + # new attach/detach API + c_lib.rocattach_attach.restype = ctypes.c_int + c_lib.rocattach_attach.argtypes = [ctypes.c_int] + c_lib.rocattach_detach.restype = ctypes.c_int + c_lib.rocattach_detach.argtypes = [ctypes.c_int] + except Exception as e: + console_error( + f"Error setting attach/detach function argument types: {e}" + ) + + pid = options["ROCPROF_ATTACH_PID"] + if pid is None: + console_error("Mode of attach/detach must have setup for process ID") + + try: + # old attach/detach API + c_lib.attach(int(pid)) + except Exception as e: + console_debug(f"Error attaching with old API: {e}, trying new API") + try: + # new attach/detach API + attach_status = c_lib.rocattach_attach(int(pid)) + if attach_status != 0: + console_error( + f"Error attaching to process {pid}, " + f"rocattach_attach returned {attach_status}" + ) + except Exception as e: + console_error(f"Error attaching to process {pid}: {e}") + + duration = os.environ.get("ROCPROF_ATTACH_DURATION", None) + if duration is None: + console_log( + f"\033[93mAttach to process with ID {pid} is successful, " + "Press Enter to detach...\033[0m" + ) + input() + else: + console_log( + f"\033[93mAttach to process with ID {pid} is successful, " + f"detach will happen in {duration} milliseconds...\033[0m" + ) + time.sleep(int(duration) / 1000) + + try: + # old attach/detach API + c_lib.detach(int(pid)) + except Exception as e: + console_debug(f"Error detaching with old API: {e}, trying new API") + try: + # new attach/detach API + detach_status = c_lib.rocattach_detach(int(pid)) + if detach_status != 0: + console_error( + f"Error detaching from process {pid}, " + f"rocattach_detach returned {detach_status}" + ) + except Exception as e: + console_error(f"Error detaching from process {pid}: {e}") + + def capture_subprocess_output( subprocess_args: list[str], new_env: Optional[dict[str, str]] = None, @@ -788,49 +883,7 @@ def run_prof( console_debug(f"rocprof sdk env vars: {new_env}") if is_mode_live_attach: - - @contextmanager - def temporary_env(env_vars: dict[str, str]) -> Generator[None, None, None]: - """ - Temporarily change the environment variable of this application. - """ - original_env = os.environ.copy() - os.environ.update({k: str(v) for k, v in env_vars.items()}) - try: - yield - finally: - os.environ.clear() - os.environ.update(original_env) - - with temporary_env(new_env): - libname = options["ROCPROF_ATTACH_LIBRARY"] - c_lib = ctypes.CDLL(libname) - if c_lib is None: - console_error(f"Error opening {libname}") - c_lib.attach.argtypes = [ctypes.c_uint] - - pid = options["ROCPROF_ATTACH_PID"] - if pid is None: - console_error( - "Mode of attach/detach must have setup for process ID" - ) - - c_lib.attach(int(pid)) - duration = os.environ.get("ROCPROF_ATTACH_DURATION", None) - if duration is None: - console_log( - f"\033[93mAttach to process with ID {pid} is successful, " - "Press Enter to detach...\033[0m" - ) - input() - else: - console_log( - f"\033[93mAttach to process with ID {pid} is successful, " - f"detach will happen in {duration} milliseconds...\033[0m" - ) - time.sleep(int(duration) / 1000) - c_lib.detach(int(pid)) - + perform_attach_detach(new_env, options) else: if app_cmd is None: console_error( diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py index 4b685e6dde..fcaf05a130 100644 --- a/projects/rocprofiler-compute/tests/test_profile_general.py +++ b/projects/rocprofiler-compute/tests/test_profile_general.py @@ -29,6 +29,7 @@ import re import sqlite3 import subprocess import sys +import time from pathlib import Path import numpy as np @@ -989,19 +990,19 @@ def test_analyze_rocpd( Dispatch, Kernel, Metadata, - Metric, + MetricDefinition, + MetricValue, RooflineData, - Value, Workload, ) table_name_map = { "compute_workload": Workload, - "compute_metric": Metric, + "compute_metric_definition": MetricDefinition, "compute_roofline_data": RooflineData, "compute_dispatch": Dispatch, "compute_kernel": Kernel, - "compute_value": Value, + "compute_metric_value": MetricValue, "compute_metadata": Metadata, } @@ -2268,6 +2269,7 @@ def test_live_attach_detach_block(binary_handler_profile_rocprof_compute): try: # Start workload process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env) + time.sleep(5) # Give workload time to start attach_detach = { "attach_pid": process_workload.pid, @@ -2316,8 +2318,9 @@ def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_co try: # Start workload with sleep mode enabled process_workload = subprocess.Popen( - [config["app_hip_dynamic_shared"], "--enable-sleep"], env=env + [*config["app_hip_dynamic_shared"], "--enable-sleep"], env=env ) + time.sleep(5) # Give workload time to start attach_detach = { "attach_pid": process_workload.pid, @@ -2358,7 +2361,7 @@ def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_co @pytest.mark.live_attach_detach -def test_live_attach_detach_singlepath_launch_stats( +def test_live_attach_detach_singlepass_launch_stats( binary_handler_profile_rocprof_compute, ): options = ["--set", "launch_stats"] @@ -2374,6 +2377,7 @@ def test_live_attach_detach_singlepath_launch_stats( try: # Start workload process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env) + time.sleep(5) # Give workload time to start attach_detach = { "attach_pid": process_workload.pid, diff --git a/projects/rocprofiler-sdk/CHANGELOG.md b/projects/rocprofiler-sdk/CHANGELOG.md index 59a41f593b..c128e1ed80 100644 --- a/projects/rocprofiler-sdk/CHANGELOG.md +++ b/projects/rocprofiler-sdk/CHANGELOG.md @@ -242,6 +242,19 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec ## ROCprofiler-SDK 1.1.0 for ROCm release 7.2 ### Added -- Strix halo support for counter collection. +- Counter collection support for `gfx1150` and `gfx1151` (Strix Halo). +- HSA Extension API v8 support. +- `hipStreamCopyAttributes` API implementation. +### Optimized +- Improved process attachment and updated the corresponding [documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3-process-attachment.html). +- Improved [Quick reference guide for rocprofv3] (https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/quick_guide.html). +- Updated installation documentation with links to the latest repository (https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/install/installation.html). + +### Resolved issues +- Fixed multi-GPU dimension mismatch. +- Fixed device lock issue for dispatch counters. +- Addressed OpenMP Tools task scheduling null pointer exception. +- Fixed stream ID errors arising during process attachment. +- Fixed issues arising during dynamic code object loading. diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp index 514d56e632..519f081021 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/cxx/enum_string.hpp @@ -1004,6 +1004,11 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipGetProcAddress_spt) #if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 20 ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo) #endif +#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21 +ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging) +ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging) +ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams) +#endif #if HIP_RUNTIME_API_TABLE_STEP_VERSION == 0 static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 442); #elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 1 @@ -1046,6 +1051,8 @@ static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 506); static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 507); #elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 20 static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 508); +#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 21 +static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 511); #else # if !defined(ROCPROFILER_UNSAFE_NO_VERSION_CHECK) && \ (defined(ROCPROFILER_CI) && ROCPROFILER_CI > 0) diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/hip/api_args.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/hip/api_args.h index 17634ce0c8..1e5608b137 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/hip/api_args.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/hip/api_args.h @@ -3377,6 +3377,26 @@ typedef union rocprofiler_hip_api_args_t size_t* paramSize; } hipKernelGetParamInfo; #endif +#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21 + struct + { + // Empty struct has a size of 0 in C but size of 1 in C++. + // Add the rocprofiler_hip_api_no_args struct to fix this + rocprofiler_hip_api_no_args no_args; + } hipExtDisableLogging; + struct + { + // Empty struct has a size of 0 in C but size of 1 in C++. + // Add the rocprofiler_hip_api_no_args struct to fix this + rocprofiler_hip_api_no_args no_args; + } hipExtEnableLogging; + struct + { + size_t log_level; + size_t log_size; + size_t log_mask; + } hipExtSetLoggingParams; +#endif } rocprofiler_hip_api_args_t; ROCPROFILER_EXTERN_C_FINI diff --git a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/hip/runtime_api_id.h b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/hip/runtime_api_id.h index 4c2475ddaf..b72d4f2cab 100644 --- a/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/hip/runtime_api_id.h +++ b/projects/rocprofiler-sdk/source/include/rocprofiler-sdk/hip/runtime_api_id.h @@ -575,6 +575,11 @@ typedef enum rocprofiler_hip_runtime_api_id_t // NOLINT(performance-enum-size) #endif #if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 20 ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo, +#endif +#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21 + ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging, + ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging, + ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams, #endif ROCPROFILER_HIP_RUNTIME_API_ID_LAST, } rocprofiler_hip_runtime_api_id_t; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/abi.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/abi.cpp index 5fa4222ad1..ac149de1d4 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/abi.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/abi.cpp @@ -623,6 +623,12 @@ ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipGetProcAddress_spt_fn, 506); ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipKernelGetParamInfo_fn, 507); #endif +#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21 +ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtDisableLogging_fn, 508); +ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtEnableLogging_fn, 509); +ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtSetLoggingParams_fn, 510); +#endif + #if HIP_RUNTIME_API_TABLE_STEP_VERSION == 0 ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 442) #elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 1 @@ -665,6 +671,8 @@ ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 506) ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 507) #elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 20 ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 508) +#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 21 +ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 511) #else INTERNAL_CI_ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 0) #endif diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.def.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.def.cpp index 44cf89d6c1..13e4cd1c76 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.def.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/hip/hip.def.cpp @@ -650,6 +650,11 @@ HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNT HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo, hipKernelGetParamInfo, hipKernelGetParamInfo_fn, kernel, paramIndex, paramOffset, paramSize); #endif +#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21 +HIP_API_INFO_DEFINITION_0(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging, hipExtDisableLogging, hipExtDisableLogging_fn); +HIP_API_INFO_DEFINITION_0(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging, hipExtEnableLogging, hipExtEnableLogging_fn); +HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams, hipExtSetLoggingParams, hipExtSetLoggingParams_fn, log_level, log_size, log_mask); +#endif // clang-format on #else diff --git a/projects/rocprofiler-systems/examples/causal-helpers.cmake b/projects/rocprofiler-systems/examples/causal-helpers.cmake index 7a472f4058..5a66833565 100644 --- a/projects/rocprofiler-systems/examples/causal-helpers.cmake +++ b/projects/rocprofiler-systems/examples/causal-helpers.cmake @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + # # function for # @@ -142,11 +145,27 @@ function(rocprofiler_systems_causal_example_executable _NAME) endif() if(ROCPROFSYS_INSTALL_EXAMPLES) - install( - TARGETS ${_NAME} ${_NAME}-rocprofsys ${_NAME}-coz - DESTINATION bin - COMPONENT rocprofiler-systems-examples - OPTIONAL + set(_TARGETS + ${_NAME} + ${_NAME}-rocprofsys + ${_NAME}-ndebug + ${_NAME}-rocprofsys-ndebug + ${_NAME}-coz ) + set(_EXISTING_TARGETS) + + foreach(_TARGET IN LISTS _TARGETS) + if(TARGET ${_TARGET}) + list(APPEND _EXISTING_TARGETS ${_TARGET}) + endif() + endforeach() + + if(_EXISTING_TARGETS) + install( + TARGETS ${_EXISTING_TARGETS} + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) + endif() endif() endfunction() diff --git a/projects/rocprofiler-systems/examples/code-coverage/CMakeLists.txt b/projects/rocprofiler-systems/examples/code-coverage/CMakeLists.txt index 60d474bd23..c25ef91312 100644 --- a/projects/rocprofiler-systems/examples/code-coverage/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/code-coverage/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-code-coverage-example LANGUAGES CXX) @@ -22,7 +25,11 @@ target_link_libraries(code-coverage PRIVATE Threads::Threads) target_compile_options(code-coverage PRIVATE ${_FLAGS}) if(ROCPROFSYS_INSTALL_EXAMPLES) - install(TARGETS code-coverage DESTINATION bin COMPONENT rocprofiler-systems-examples) + install( + TARGETS code-coverage + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) endif() set(PYTHON_FILES code-coverage.py) @@ -42,7 +49,7 @@ if(Python3_FOUND) if(ROCPROFSYS_INSTALL_EXAMPLES) install( PROGRAMS ${PROJECT_BINARY_DIR}/${_FILE} - DESTINATION bin + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples COMPONENT rocprofiler-systems-examples ) endif() diff --git a/projects/rocprofiler-systems/examples/fork/CMakeLists.txt b/projects/rocprofiler-systems/examples/fork/CMakeLists.txt index 5a54900e61..af72c34ecc 100644 --- a/projects/rocprofiler-systems/examples/fork/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/fork/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-fork LANGUAGES CXX) @@ -18,7 +21,13 @@ target_link_libraries( target_compile_options(fork-example PRIVATE ${_FLAGS}) if(ROCPROFSYS_INSTALL_EXAMPLES) - install(TARGETS fork-example DESTINATION bin COMPONENT rocprofiler-systems-examples) + if(TARGET fork-example) + install( + TARGETS fork-example + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) + endif() endif() # HIP fork example (multi-process concurrency test) @@ -81,10 +90,10 @@ if(HIPCC_EXECUTABLE) rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET hipMallocConcurrencyMproc) endif() - if(ROCPROFSYS_INSTALL_EXAMPLES) + if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET hipMallocConcurrencyMproc) install( TARGETS hipMallocConcurrencyMproc - DESTINATION bin + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples COMPONENT rocprofiler-systems-examples ) endif() diff --git a/projects/rocprofiler-systems/examples/jpegdecode/CMakeLists.txt b/projects/rocprofiler-systems/examples/jpegdecode/CMakeLists.txt index ba5bc9fb6b..992d0272d2 100644 --- a/projects/rocprofiler-systems/examples/jpegdecode/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/jpegdecode/CMakeLists.txt @@ -1,25 +1,5 @@ -################################################################################ -# Copyright (c) 2024 - 2025 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT cmake_minimum_required(VERSION 3.21 FATAL_ERROR) @@ -157,11 +137,15 @@ if(HIP_FOUND AND rocjpeg_FOUND AND Threads_FOUND AND rocprofiler-register_FOUND) target_compile_options(jpegdecode PRIVATE ${_FLAGS}) copy_image_files_and_make_copies() - if(ROCPROFSYS_INSTALL_EXAMPLES) - install(TARGETS jpegdecode DESTINATION bin COMPONENT rocprofiler-systems-examples) + if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET jpegdecode) install( - FILES ${CMAKE_BINARY_DIR}/images - DESTINATION share/rocprofiler-systems/tests/images + TARGETS jpegdecode + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) + install( + DIRECTORY ${CMAKE_BINARY_DIR}/images/ + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/images COMPONENT rocprofiler-systems-examples ) endif() diff --git a/projects/rocprofiler-systems/examples/lulesh/CMakeLists.txt b/projects/rocprofiler-systems/examples/lulesh/CMakeLists.txt index 9fd7935d3e..62dc292e2a 100644 --- a/projects/rocprofiler-systems/examples/lulesh/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/lulesh/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-lulesh-example LANGUAGES C CXX) @@ -82,16 +85,11 @@ rocprofiler_systems_causal_example_executable( INCLUDE_DIRECTORIES ${PROJECT_SOURCE_DIR}/includes ) -if(ROCPROFSYS_INSTALL_EXAMPLES) - if(LULESH_BUILD_KOKKOS) - install( - TARGETS kokkoscore kokkoscontainers - DESTINATION ${CMAKE_INSTALL_LIBDIR} +if(ROCPROFSYS_INSTALL_EXAMPLES AND LULESH_BUILD_KOKKOS) + install( + TARGETS kokkoscore kokkoscontainers kokkossimd + LIBRARY + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/lib COMPONENT rocprofiler-systems-examples - ) - set_target_properties( - lulesh - PROPERTIES INSTALL_RPATH "\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}" - ) - endif() + ) endif() diff --git a/projects/rocprofiler-systems/examples/mpi/CMakeLists.txt b/projects/rocprofiler-systems/examples/mpi/CMakeLists.txt index 2ff4043a3c..80a4a70c47 100644 --- a/projects/rocprofiler-systems/examples/mpi/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/mpi/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-mpi-examples LANGUAGES C CXX) @@ -77,16 +80,24 @@ add_executable(mpi-example mpi.cpp) target_link_libraries(mpi-example PRIVATE mpi-cxx-interface-library) if(ROCPROFSYS_INSTALL_EXAMPLES) - install( - TARGETS - mpi-example - mpi-allgather - mpi-bcast - mpi-all2all - mpi-reduce - mpi-scatter-gather - mpi-send-recv - DESTINATION bin - COMPONENT rocprofiler-systems-examples + set(MPI_EXAMPLES + mpi-example + mpi-allgather + mpi-bcast + mpi-all2all + mpi-reduce + mpi-scatter-gather + mpi-send-recv + mpi-allreduce ) + + foreach(MPI_EXAMPLE IN LISTS MPI_EXAMPLES) + if(TARGET ${MPI_EXAMPLE}) + install( + TARGETS ${MPI_EXAMPLE} + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) + endif() + endforeach() endif() diff --git a/projects/rocprofiler-systems/examples/openmp/CMakeLists.txt b/projects/rocprofiler-systems/examples/openmp/CMakeLists.txt index 9e0302b73b..079a10516d 100644 --- a/projects/rocprofiler-systems/examples/openmp/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/openmp/CMakeLists.txt @@ -89,13 +89,17 @@ endif() target_link_libraries(openmp-cg PRIVATE openmp-common) target_link_libraries(openmp-lu PRIVATE openmp-common) -if(ROCPROFSYS_INSTALL_EXAMPLES) - install( - TARGETS openmp-cg openmp-lu - DESTINATION bin - COMPONENT rocprofiler-systems-examples - ) -endif() +set(OPENMP_EXAMPLES openmp-cg openmp-lu) + +foreach(OPENMP_EXAMPLE IN LISTS OPENMP_EXAMPLES) + if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET ${OPENMP_EXAMPLE}) + install( + TARGETS ${OPENMP_EXAMPLE} + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) + endif() +endforeach() set(DEFAULT_GPU_TARGETS "gfx900" diff --git a/projects/rocprofiler-systems/examples/openmp/external/CMakeLists.txt b/projects/rocprofiler-systems/examples/openmp/external/CMakeLists.txt index 5c184187a8..bc502d2e42 100644 --- a/projects/rocprofiler-systems/examples/openmp/external/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/openmp/external/CMakeLists.txt @@ -280,3 +280,19 @@ set(ROCPROFSYS_OMPVV_OFFLOAD_TESTS rocprofiler_systems_message(STATUS "Successfully configured OMPVV" ) + +if(ROCPROFSYS_INSTALL_EXAMPLES) + foreach( + test_target + IN + LISTS ROCPROFSYS_OMPVV_HOST_TESTS ROCPROFSYS_OMPVV_OFFLOAD_TESTS + ) + if(TARGET "${test_target}-build") + install( + PROGRAMS "${OMPVV_BIN_DEST}/${test_target}" + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) + endif() + endforeach() +endif() diff --git a/projects/rocprofiler-systems/examples/openmp/target/CMakeLists.txt b/projects/rocprofiler-systems/examples/openmp/target/CMakeLists.txt index af4a70ca4b..d9a3e05857 100644 --- a/projects/rocprofiler-systems/examples/openmp/target/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/openmp/target/CMakeLists.txt @@ -71,7 +71,7 @@ endif() # Use the directory that actually contains the library we found get_filename_component(_rocm_llvm_lib "${LIBOMPTARGET_SO}" DIRECTORY) set(_rocm_clang_lib "${ROCM_ROOT_DIR}/lib") -set(_COMMON_RPATH "${_rocm_llvm_lib};${_rocm_clang_lib};$ORIGIN") +set(_COMMON_RPATH "${_rocm_llvm_lib};${_rocm_clang_lib};$ORIGIN;$ORIGIN/lib") if(ROCmVersion_DIR) list(APPEND _COMMON_RPATH "${ROCmVersion_DIR}/llvm/lib") endif() @@ -126,3 +126,15 @@ rocprofiler_systems_custom_compilation( rocprofiler_systems_custom_compilation(TARGET openmp-target COMPILER ${OMP_TARGET_COMPILER} ) + +if(ROCPROFSYS_INSTALL_EXAMPLES) + if(TARGET openmp-target AND TARGET openmp-target-lib) + install( + TARGETS openmp-target openmp-target-lib + RUNTIME DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + LIBRARY + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/lib + COMPONENT rocprofiler-systems-examples + ) + endif() +endif() diff --git a/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt b/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt index 8737781185..452923551b 100644 --- a/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/parallel-overhead/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-parallel-overhead-example LANGUAGES CXX) @@ -32,10 +35,13 @@ target_link_libraries( ) target_compile_definitions(parallel-overhead-locks PRIVATE USE_LOCKS=1) -if(ROCPROFSYS_INSTALL_EXAMPLES) - install( - TARGETS parallel-overhead parallel-overhead-locks - DESTINATION bin - COMPONENT rocprofiler-systems-examples - ) -endif() +set(PARALLEL_OVERHEAD_EXAMPLES parallel-overhead parallel-overhead-locks) +foreach(PARALLEL_OVERHEAD_EXAMPLE IN LISTS PARALLEL_OVERHEAD_EXAMPLES) + if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET ${PARALLEL_OVERHEAD_EXAMPLE}) + install( + TARGETS ${PARALLEL_OVERHEAD_EXAMPLE} + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) + endif() +endforeach() diff --git a/projects/rocprofiler-systems/examples/python/CMakeLists.txt b/projects/rocprofiler-systems/examples/python/CMakeLists.txt index 29b370b3ca..ab50d2f1cf 100644 --- a/projects/rocprofiler-systems/examples/python/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/python/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-python) @@ -36,7 +39,7 @@ if(Python3_FOUND) if(ROCPROFSYS_INSTALL_EXAMPLES) install( PROGRAMS ${PROJECT_BINARY_DIR}/${_FILE} - DESTINATION bin + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples COMPONENT rocprofiler-systems-examples ) endif() diff --git a/projects/rocprofiler-systems/examples/rccl/CMakeLists.txt b/projects/rocprofiler-systems/examples/rccl/CMakeLists.txt index 8fb842c1a4..1a23d69dd1 100644 --- a/projects/rocprofiler-systems/examples/rccl/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/rccl/CMakeLists.txt @@ -138,6 +138,14 @@ if(hip_FOUND AND rccl_FOUND) add_dependencies(rccl-tests::${_EXE_NAME} copy-${_EXE_NAME}) list(APPEND _RCCL_TEST_TARGETS "rccl-tests::${_EXE_NAME}") + + if(ROCPROFSYS_INSTALL_EXAMPLES) + install( + PROGRAMS ${_EXE_DEST_PATH} + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) + endif() endforeach() set(RCCL_TEST_TARGETS "${_RCCL_TEST_TARGETS}" CACHE INTERNAL "rccl-test targets") diff --git a/projects/rocprofiler-systems/examples/rewrite-caller/CMakeLists.txt b/projects/rocprofiler-systems/examples/rewrite-caller/CMakeLists.txt index aa1b7a0b34..1066d18e2a 100644 --- a/projects/rocprofiler-systems/examples/rewrite-caller/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/rewrite-caller/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-rewrite-caller-example LANGUAGES CXX) @@ -18,6 +21,10 @@ set(CMAKE_BUILD_TYPE "Debug") add_executable(rewrite-caller rewrite-caller.cpp) target_compile_options(rewrite-caller PRIVATE ${_FLAGS}) -if(ROCPROFSYS_INSTALL_EXAMPLES) - install(TARGETS rewrite-caller DESTINATION bin COMPONENT rocprofiler-systems-examples) +if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET rewrite-caller) + install( + TARGETS rewrite-caller + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) endif() diff --git a/projects/rocprofiler-systems/examples/roctx/CMakeLists.txt b/projects/rocprofiler-systems/examples/roctx/CMakeLists.txt index f5f7dd19c6..da58cdf4ac 100644 --- a/projects/rocprofiler-systems/examples/roctx/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/roctx/CMakeLists.txt @@ -1,24 +1,5 @@ -# MIT License -# -# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT cmake_minimum_required(VERSION 3.21 FATAL_ERROR) @@ -113,6 +94,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE) rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET roctx) endif() -if(ROCPROFSYS_INSTALL_EXAMPLES) - install(TARGETS roctx DESTINATION bin COMPONENT rocprofiler-systems-examples) +if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET roctx) + install( + TARGETS roctx + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) endif() diff --git a/projects/rocprofiler-systems/examples/thread-limit/CMakeLists.txt b/projects/rocprofiler-systems/examples/thread-limit/CMakeLists.txt index 5a486228d8..63df5ca529 100644 --- a/projects/rocprofiler-systems/examples/thread-limit/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/thread-limit/CMakeLists.txt @@ -40,3 +40,11 @@ target_compile_options(tests-compile-options INTERFACE -g) add_executable(thread-limit thread-limit.cpp) target_compile_definitions(thread-limit PRIVATE MAX_THREADS=${ROCPROFSYS_MAX_THREADS}) target_link_libraries(thread-limit PRIVATE Threads::Threads tests-compile-options) + +if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET thread-limit) + install( + TARGETS thread-limit + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) +endif() diff --git a/projects/rocprofiler-systems/examples/trace-time-window/CMakeLists.txt b/projects/rocprofiler-systems/examples/trace-time-window/CMakeLists.txt index 3a8f396ba2..be61d05dc1 100644 --- a/projects/rocprofiler-systems/examples/trace-time-window/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/trace-time-window/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-trace-time-window-example LANGUAGES CXX) @@ -18,10 +21,10 @@ set(CMAKE_BUILD_TYPE "Debug") add_executable(trace-time-window trace-time-window.cpp) target_compile_options(trace-time-window PRIVATE ${_FLAGS}) -if(ROCPROFSYS_INSTALL_EXAMPLES) +if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET trace-time-window) install( TARGETS trace-time-window - DESTINATION bin + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples COMPONENT rocprofiler-systems-examples ) endif() diff --git a/projects/rocprofiler-systems/examples/transferBench/CMakeLists.txt b/projects/rocprofiler-systems/examples/transferBench/CMakeLists.txt index a0af07f3df..5d29d8f15b 100644 --- a/projects/rocprofiler-systems/examples/transferBench/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/transferBench/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-transferBench-example LANGUAGES CXX) @@ -120,6 +123,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE) rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transferBench) endif() -if(ROCPROFSYS_INSTALL_EXAMPLES) - install(TARGETS transferBench DESTINATION bin COMPONENT rocprofiler-systems-examples) +if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET transferBench) + install( + TARGETS transferBench + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) endif() diff --git a/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt b/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt index 2406674764..0dcf2fd950 100644 --- a/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/transpose/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-transpose-example LANGUAGES CXX) @@ -93,6 +96,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE) rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transpose) endif() -if(ROCPROFSYS_INSTALL_EXAMPLES) - install(TARGETS transpose DESTINATION bin COMPONENT rocprofiler-systems-examples) +if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET transpose) + install( + TARGETS transpose + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) endif() diff --git a/projects/rocprofiler-systems/examples/user-api/CMakeLists.txt b/projects/rocprofiler-systems/examples/user-api/CMakeLists.txt index cf99ae647c..4fcd7fb3e6 100644 --- a/projects/rocprofiler-systems/examples/user-api/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/user-api/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-user-api-example LANGUAGES CXX) @@ -24,6 +27,10 @@ target_link_libraries( PRIVATE Threads::Threads rocprofiler-systems::rocprofiler-systems-user-library ) -if(ROCPROFSYS_INSTALL_EXAMPLES) - install(TARGETS user-api DESTINATION bin COMPONENT rocprofiler-systems-examples) +if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET user-api) + install( + TARGETS user-api + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples + COMPONENT rocprofiler-systems-examples + ) endif() diff --git a/projects/rocprofiler-systems/examples/videodecode/CMakeLists.txt b/projects/rocprofiler-systems/examples/videodecode/CMakeLists.txt index ace4a9a189..1bf1fadeb3 100644 --- a/projects/rocprofiler-systems/examples/videodecode/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/videodecode/CMakeLists.txt @@ -1,3 +1,6 @@ +# Copyright (c) Advanced Micro Devices, Inc. +# SPDX-License-Identifier: MIT + cmake_minimum_required(VERSION 3.21 FATAL_ERROR) project(rocprofiler-systems-videodecode-example LANGUAGES CXX) @@ -176,15 +179,15 @@ if(FFMPEG_FOUND AND rocdecode_FOUND) target_compile_definitions(videodecode PUBLIC USE_AVCODEC_GREATER_THAN_58_134=1) endif() - if(ROCPROFSYS_INSTALL_EXAMPLES) + if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET videodecode) install( TARGETS videodecode - DESTINATION bin + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples COMPONENT rocprofiler-systems-examples ) install( - FILES ${CMAKE_BINARY_DIR}/videos - DESTINATION share/rocprofiler-systems/tests/videos + DIRECTORY ${CMAKE_BINARY_DIR}/videos/ + DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/videos COMPONENT rocprofiler-systems-examples ) endif() diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-run/impl.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-run/impl.cpp index f869ec8ca0..a19121529b 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-run/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-run/impl.cpp @@ -192,6 +192,10 @@ prepare_environment_for_run(parser_data_t& _data) rocprofsys::argparse::add_ld_preload(_data); rocprofsys::argparse::add_ld_library_path(_data); } + + rocprofsys::argparse::add_torch_library_path(_data, _data.verbose > 0); + + rocprofsys::common::consolidate_env_entries(_data.current); } void diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp index 6a4b27edb4..96008b2dc6 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/impl.cpp @@ -933,3 +933,9 @@ parse_args(int argc, char** argv, std::vector& _env) return _outv; } + +void +add_torch_library_path(std::vector& envp, const std::vector& argv) +{ + rocprofsys::common::add_torch_library_path(envp, argv, verbose > 0, updated_envs); +} diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/rocprof-sys-sample.cpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/rocprof-sys-sample.cpp index 122d02e59d..03098fa03a 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/rocprof-sys-sample.cpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/rocprof-sys-sample.cpp @@ -51,6 +51,8 @@ main(int argc, char** argv) _argv.emplace_back(argv[i]); } + add_torch_library_path(_env, _argv); + print_updated_environment(_env); if(!_argv.empty()) diff --git a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/rocprof-sys-sample.hpp b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/rocprof-sys-sample.hpp index 2134bc0680..8878d92635 100644 --- a/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/rocprof-sys-sample.hpp +++ b/projects/rocprofiler-systems/source/bin/rocprof-sys-sample/rocprof-sys-sample.hpp @@ -35,3 +35,6 @@ get_initial_environment(); std::vector parse_args(int argc, char** argv, std::vector& envp); + +void +add_torch_library_path(std::vector& envp, const std::vector& argv); diff --git a/projects/rocprofiler-systems/source/lib/common/environment.hpp b/projects/rocprofiler-systems/source/lib/common/environment.hpp index 7d2f45eceb..1ff89d355a 100644 --- a/projects/rocprofiler-systems/source/lib/common/environment.hpp +++ b/projects/rocprofiler-systems/source/lib/common/environment.hpp @@ -26,9 +26,12 @@ #include "common/join.hpp" #include +#include #include #include #include +#include +#include #include #include #include @@ -197,7 +200,7 @@ remove_env(std::vector& _environ, std::string_view _env_var, { if(match(itr)) { - free(itr); + std::free(itr); itr = nullptr; } } @@ -266,6 +269,113 @@ discover_llvm_libdir_for_ompt(bool verbose = false) return {}; } +inline bool +is_python_interpreter(std::string_view executable) +{ + if(executable.empty()) return false; + + const auto slash_pos = executable.rfind('/'); + const auto basename = (slash_pos != std::string_view::npos) + ? executable.substr(slash_pos + 1) + : executable; + + if(basename == "python" || basename == "python3") return true; + + constexpr std::string_view python3_prefix = "python3."; + + const bool has_valid_prefix = + basename.size() > python3_prefix.size() && + basename.substr(0, python3_prefix.size()) == python3_prefix; + if(!has_valid_prefix) return false; + + const auto version_digits = basename.substr(python3_prefix.size()); + + return std::all_of(version_digits.begin(), version_digits.end(), + [](unsigned char c) { return std::isdigit(c); }); +} + +inline std::string +discover_torch_libpath(const std::string& python_binary, bool verbose = false) +{ + if(python_binary.empty()) return {}; + + const auto is_safe_executable_path = [](const std::string& path) { + // Allow only a conservative set of characters in the executable path to + // avoid injection when used in a shell command. + for(unsigned char c : path) + { + if(std::isalnum(c) != 0) continue; + switch(c) + { + case '/': + case '.': + case '_': + case '-': + case '+': break; + default: return false; + } + } + return true; + }; + + if(!is_safe_executable_path(python_binary)) + { + ROCPROFSYS_ENVIRON_LOG( + verbose, "Unsafe characters detected in Python interpreter path: %s\n", + python_binary.c_str()); + return {}; + } + + const auto cmd = "\"" + python_binary + + "\" -c \"import torch; print(torch.__path__[0])\" 2>/dev/null"; + + FILE* pipe = popen(cmd.c_str(), "r"); + if(!pipe) + { + ROCPROFSYS_ENVIRON_LOG(verbose, "Failed to execute command: %s\n", cmd.c_str()); + return {}; + } + + char buffer[1024]; + std::string result; + while(fgets(buffer, sizeof(buffer), pipe)) + { + result.append(buffer); + // stop if we've read the full line (torch path is printed on a single line) + if(!result.empty() && result.back() == '\n') break; + } + + int status = pclose(pipe); + + if(status != 0 || result.empty()) + { + ROCPROFSYS_ENVIRON_LOG(verbose, "torch not found for Python interpreter: %s\n", + python_binary.c_str()); + return {}; + } + + while(!result.empty() && + (result.back() == '\n' || result.back() == '\r' || result.back() == ' ')) + { + result.pop_back(); + } + + if(result.empty()) return {}; + + std::string torch_libdir = result + "/lib"; + + if(!::tim::filepath::direxists(torch_libdir)) + { + ROCPROFSYS_ENVIRON_LOG(verbose, "torch lib directory does not exist: %s\n", + torch_libdir.c_str()); + return {}; + } + + ROCPROFSYS_ENVIRON_LOG(verbose, "Discovered torch library path: %s\n", + torch_libdir.c_str()); + return torch_libdir; +} + enum class update_mode : uint8_t { REPLACE = 0, @@ -335,7 +445,7 @@ update_env(std::vector& _environ, std::string_view _env_var, Tp&& _env_va } else { - free(itr); + std::free(itr); itr = strdup(join('=', _env_var, _env_val_str).c_str()); } return; @@ -343,5 +453,145 @@ update_env(std::vector& _environ, std::string_view _env_var, Tp&& _env_va _environ.emplace_back(strdup(join('=', _env_var, _env_val_str).c_str())); } +template +inline void +add_torch_library_path(std::vector& envp, const std::vector& argv, + bool verbose, UpdatedEnvsT& updated_envs) +{ + if(argv.empty() || argv.front() == nullptr) return; + if(!is_python_interpreter(argv.front())) return; + + auto torch_libpath = discover_torch_libpath(argv.front(), verbose); + if(torch_libpath.empty()) return; + + std::unordered_set seen{ torch_libpath }; + std::string result = torch_libpath; + + constexpr std::string_view ld_prefix = "LD_LIBRARY_PATH="; + + auto is_ld_path = [&](char* entry) { + return entry && + std::string_view{ entry }.substr(0, ld_prefix.length()) == ld_prefix; + }; + + for(auto& entry : envp) + { + if(!is_ld_path(entry)) continue; + + std::istringstream stream{ std::string{ entry + ld_prefix.length() } }; + for(std::string path; std::getline(stream, path, ':');) + { + if(!path.empty() && seen.insert(path).second) result += ":" + path; + } + + std::free(entry); + entry = nullptr; + } + + envp.erase(std::remove(envp.begin(), envp.end(), nullptr), envp.end()); + envp.emplace_back(strdup(join("", ld_prefix, result).c_str())); + + updated_envs.emplace(ld_prefix.substr(0, ld_prefix.length() - 1)); +} + +inline void +consolidate_env_entries(std::vector& envp) +{ + constexpr char delim = ':'; + + struct key_data + { + std::vector parts; + std::unordered_set seen; + + void add_unique(std::string part) + { + if(!part.empty() && seen.insert(part).second) + parts.emplace_back(std::move(part)); + } + }; + + auto parse_entry = [](std::string_view entry) + -> std::optional> { + auto eq_pos = entry.find('='); + if(eq_pos == std::string_view::npos) return std::nullopt; + return std::make_pair(entry.substr(0, eq_pos), entry.substr(eq_pos + 1)); + }; + + auto join_parts = [delim](std::string_view key, + const std::vector& parts) { + std::string result; + + const auto total_parts_length = std::accumulate( + parts.begin(), parts.end(), std::size_t{ 0 }, + [](std::size_t acc, const std::string& part) { return acc + part.size(); }); + + const auto delim_count = parts.size() - 1; + const auto equal_sign_length = 1; + + result.reserve(key.size() + equal_sign_length + total_parts_length + delim_count); + result.append(key); + result += '='; + + result = + std::accumulate(parts.begin(), parts.end(), std::move(result), + [delim, &parts](std::string acc, const std::string& part) { + if(part != parts.front()) acc += delim; + acc.append(part); + return acc; + }); + + return result; + }; + + std::unordered_map key_map; + std::vector key_order; + + for(auto* entry : envp) + { + if(!entry) + { + continue; + } + + auto parsed = parse_entry(entry); + if(!parsed) + { + continue; + } + + auto [key, value] = *parsed; + + auto [it, inserted] = key_map.try_emplace(key); + if(inserted) + { + key_order.emplace_back(key); + } + + auto& data = it->second; + std::istringstream stream{ std::string{ value } }; + for(std::string part; std::getline(stream, part, delim);) + { + data.add_unique(part); + } + } + + std::vector result; + result.reserve(key_order.size()); + + for(auto key : key_order) + { + result.emplace_back(strdup(join_parts(key, key_map[key].parts).c_str())); + } + + for(auto* entry : envp) + { + std::free(entry); + entry = nullptr; + } + + envp = std::move(result); +} + } // namespace common } // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/common/tests/CMakeLists.txt b/projects/rocprofiler-systems/source/lib/common/tests/CMakeLists.txt index 79d3f4ad03..4e2391824c 100644 --- a/projects/rocprofiler-systems/source/lib/common/tests/CMakeLists.txt +++ b/projects/rocprofiler-systems/source/lib/common/tests/CMakeLists.txt @@ -24,6 +24,7 @@ add_library( lib-common-tests OBJECT test_discover_llvm_libdir.cpp + test_environment.cpp test_path.cpp test_remove_env.cpp test_update_env.cpp diff --git a/projects/rocprofiler-systems/source/lib/common/tests/test_environment.cpp b/projects/rocprofiler-systems/source/lib/common/tests/test_environment.cpp new file mode 100644 index 0000000000..7c33603aa1 --- /dev/null +++ b/projects/rocprofiler-systems/source/lib/common/tests/test_environment.cpp @@ -0,0 +1,146 @@ +// Copyright (c) Advanced Micro Devices, Inc. +// SPDX-License-Identifier: MIT + +#include "common/environment.hpp" + +#include + +using namespace rocprofsys::common; + +class IsPythonInterpreterTest : public ::testing::Test +{}; + +TEST_F(IsPythonInterpreterTest, RecognizesPython) +{ + EXPECT_TRUE(is_python_interpreter("python")); + EXPECT_TRUE(is_python_interpreter("python3")); + EXPECT_TRUE(is_python_interpreter("python3.8")); + EXPECT_TRUE(is_python_interpreter("python3.9")); + EXPECT_TRUE(is_python_interpreter("python3.10")); + EXPECT_TRUE(is_python_interpreter("python3.11")); + EXPECT_TRUE(is_python_interpreter("python3.12")); + EXPECT_TRUE(is_python_interpreter("/usr/bin/python")); + EXPECT_TRUE(is_python_interpreter("/usr/bin/python3")); + EXPECT_TRUE(is_python_interpreter("/usr/bin/python3.10")); + EXPECT_TRUE(is_python_interpreter("/home/user/venv/bin/python")); + EXPECT_TRUE(is_python_interpreter("/opt/conda/bin/python3.11")); + EXPECT_FALSE(is_python_interpreter("bash")); + EXPECT_FALSE(is_python_interpreter("sh")); + EXPECT_FALSE(is_python_interpreter("ruby")); + EXPECT_FALSE(is_python_interpreter("node")); + EXPECT_FALSE(is_python_interpreter("java")); + EXPECT_FALSE(is_python_interpreter("/usr/bin/bash")); + EXPECT_FALSE(is_python_interpreter("./my_app")); + EXPECT_FALSE(is_python_interpreter("pythonista")); + EXPECT_FALSE(is_python_interpreter("python_script.py")); + EXPECT_FALSE(is_python_interpreter("mypython")); + EXPECT_FALSE(is_python_interpreter("python2")); + EXPECT_FALSE(is_python_interpreter("python3.")); + EXPECT_FALSE(is_python_interpreter("python3.a")); + EXPECT_FALSE(is_python_interpreter("python3.10a")); + EXPECT_FALSE(is_python_interpreter("python3x10")); + EXPECT_FALSE(is_python_interpreter("")); + EXPECT_FALSE(is_python_interpreter("/usr/bin/")); +} + +class DuplicatedEnvironmentEntriesTest : public ::testing::Test +{}; + +TEST_F(DuplicatedEnvironmentEntriesTest, DuplicateEnvironmentEntries) +{ + std::vector env_vars = { + strdup("PATH=/usr/local/bin:/usr/bin:/bin:/usr/local/bin2"), + strdup("PATH=/usr/local/bin:/usr/bin:/bin"), + }; + + consolidate_env_entries(env_vars); + + ASSERT_EQ(env_vars.size(), 1); + EXPECT_STREQ(env_vars[0], "PATH=/usr/local/bin:/usr/bin:/bin:/usr/local/bin2"); + + for(auto* entry : env_vars) + free(entry); +} + +TEST_F(DuplicatedEnvironmentEntriesTest, HandlesEmptyVector) +{ + std::vector env_vars; + consolidate_env_entries(env_vars); + EXPECT_TRUE(env_vars.empty()); +} + +TEST_F(DuplicatedEnvironmentEntriesTest, HandlesNullEntries) +{ + std::vector env_vars = { + strdup("PATH=/usr/bin"), + nullptr, + strdup("PATH=/bin"), + }; + consolidate_env_entries(env_vars); + ASSERT_EQ(env_vars.size(), 1); + EXPECT_STREQ(env_vars[0], "PATH=/usr/bin:/bin"); + for(auto* entry : env_vars) + std::free(entry); +} + +TEST_F(DuplicatedEnvironmentEntriesTest, HandlesEmptyValues) +{ + std::vector env_vars = { + strdup("EMPTY_VAR="), + strdup("PATH=/usr/bin"), + }; + consolidate_env_entries(env_vars); + ASSERT_EQ(env_vars.size(), 2); + + for(auto* entry : env_vars) + std::free(entry); +} + +class AddTorchLibraryPathTest : public ::testing::Test +{ +protected: + std::unordered_set updated_envs; +}; + +TEST_F(AddTorchLibraryPathTest, SkipsNonPythonExecutables) +{ + std::vector envp = { + strdup("LD_LIBRARY_PATH=/usr/lib"), + }; + std::vector argv = { + strdup("/usr/bin/bash"), + }; + add_torch_library_path(envp, argv, false, updated_envs); + // Should not modify environment + ASSERT_EQ(envp.size(), 1); + EXPECT_STREQ(envp[0], "LD_LIBRARY_PATH=/usr/lib"); + for(auto* entry : envp) + std::free(entry); + for(auto* entry : argv) + std::free(entry); +} + +TEST_F(AddTorchLibraryPathTest, HandlesEmptyArgv) +{ + std::vector envp = { + strdup("LD_LIBRARY_PATH=/usr/lib"), + }; + std::vector argv; + add_torch_library_path(envp, argv, false, updated_envs); + ASSERT_EQ(envp.size(), 1); + EXPECT_STREQ(envp[0], "LD_LIBRARY_PATH=/usr/lib"); + for(auto* entry : envp) + std::free(entry); +} + +TEST_F(AddTorchLibraryPathTest, HandlesNullArgvFront) +{ + std::vector envp = { + strdup("LD_LIBRARY_PATH=/usr/lib"), + }; + std::vector argv = { nullptr }; + add_torch_library_path(envp, argv, false, updated_envs); + ASSERT_EQ(envp.size(), 1); + for(auto* entry : envp) + std::free(entry); +} diff --git a/projects/rocprofiler-systems/source/lib/core/argparse.cpp b/projects/rocprofiler-systems/source/lib/core/argparse.cpp index c4230ff807..b4ea4c41d1 100644 --- a/projects/rocprofiler-systems/source/lib/core/argparse.cpp +++ b/projects/rocprofiler-systems/source/lib/core/argparse.cpp @@ -168,6 +168,14 @@ add_ld_library_path(parser_data& _data) return _data; } +parser_data& +add_torch_library_path(parser_data& _data, bool verbose) +{ + rocprofsys::common::add_torch_library_path(_data.current, _data.command, verbose, + _data.updated); + return _data; +} + parser_data& add_core_arguments(parser_t& _parser, parser_data& _data) { diff --git a/projects/rocprofiler-systems/source/lib/core/argparse.hpp b/projects/rocprofiler-systems/source/lib/core/argparse.hpp index 9280ddd4f9..744c097b0e 100644 --- a/projects/rocprofiler-systems/source/lib/core/argparse.hpp +++ b/projects/rocprofiler-systems/source/lib/core/argparse.hpp @@ -83,6 +83,9 @@ add_ld_preload(parser_data&); parser_data& add_ld_library_path(parser_data&); +parser_data& +add_torch_library_path(parser_data&, bool verbose = false); + parser_data& add_core_arguments(parser_t&, parser_data&); diff --git a/projects/rocprofiler-systems/source/lib/core/config.cpp b/projects/rocprofiler-systems/source/lib/core/config.cpp index 192875063e..b4a1cbe6e0 100644 --- a/projects/rocprofiler-systems/source/lib/core/config.cpp +++ b/projects/rocprofiler-systems/source/lib/core/config.cpp @@ -316,7 +316,9 @@ configure_settings(bool _init) "backend", "perfetto"); ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE_LEGACY", - "Use legacy direct mode for perfetto tracing instead of " + "[DEPRECATED] The new default option is to use data from " + "cached buffer. When set to true system will use " + "legacy direct mode for perfetto tracing instead of " "deferred trace generation. When false (default), uses " "cached mode with minimal runtime overhead.", false, "backend", "perfetto"); @@ -1088,6 +1090,9 @@ configure_settings(bool _init) handle_deprecated_setting("ROCPROFSYS_OUTPUT_FILE", "ROCPROFSYS_PERFETTO_FILE"); handle_deprecated_setting("ROCPROFSYS_USE_PERFETTO", "ROCPROFSYS_TRACE"); handle_deprecated_setting("ROCPROFSYS_USE_TIMEMORY", "ROCPROFSYS_PROFILE"); + handle_deprecated_setting("ROCPROFSYS_DEBUG", "ROCPROFSYS_LOG_LEVEL"); + handle_deprecated_setting("ROCPROFSYS_VERBOSE", "ROCPROFSYS_LOG_LEVEL"); + handle_deprecated_setting("ROCPROFSYS_TRACE_LEGACY", "ROCPROFSYS_TRACE"); scope::get_fields()[scope::flat::value] = _config->get_flat_profile(); scope::get_fields()[scope::timeline::value] = _config->get_timeline_profile(); diff --git a/projects/rocprofiler-systems/source/lib/core/gpu.cpp b/projects/rocprofiler-systems/source/lib/core/gpu.cpp index fdfd70348d..08cab7fb55 100644 --- a/projects/rocprofiler-systems/source/lib/core/gpu.cpp +++ b/projects/rocprofiler-systems/source/lib/core/gpu.cpp @@ -31,6 +31,8 @@ } \ } // namespace ::tim::cereal +#include "common/defines.h" + #if !defined(ROCPROFSYS_USE_ROCM) # define ROCPROFSYS_USE_ROCM 0 #endif @@ -40,7 +42,6 @@ #include -#include #include #include "core/agent_manager.hpp" @@ -92,17 +93,6 @@ _amdsmi_is_initialized() return initialized; } -void -prevent_amdsmi_library_unload() -{ - static bool _initialized = false; - if(_initialized) return; - _initialized = true; - - dlopen("libamd_smi.so", RTLD_NOW | RTLD_NOLOAD | RTLD_NODELETE); - dlopen("librocm_smi64.so", RTLD_NOW | RTLD_NOLOAD | RTLD_NODELETE); -} - bool amdsmi_init() { @@ -113,8 +103,6 @@ amdsmi_init() ROCPROFSYS_AMD_SMI_CALL(::amdsmi_init(AMDSMI_INIT_AMD_GPUS)); get_processor_handles(); _amdsmi_is_initialized() = true; // Mark as initialized - - prevent_amdsmi_library_unload(); } catch(std::exception& _e) { LOG_ERROR("Exception thrown initializing amd-smi: {}", _e.what()); diff --git a/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp index 52b9f2607e..a0df719742 100644 --- a/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp +++ b/projects/rocprofiler-systems/source/lib/core/rocprofiler-sdk.cpp @@ -367,7 +367,6 @@ config_settings(const std::shared_ptr& _config) _skip_domains.emplace("kernel_dispatch"); _skip_domains.emplace("page_migration"); - _skip_domains.emplace("scratch_memory"); _add_operation_settings( "MARKER_API", callback_tracing_info[ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API], @@ -652,7 +651,7 @@ get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv) { if(callback_operation_option_names.count(kindv) == 0) { - LOG_CRITICAL("callback_operation_operation_names does not have value for {}", + LOG_CRITICAL("callback_operation_option_names does not have value for {}", static_cast(kindv)); ::rocprofsys::set_state(::rocprofsys::State::Finalized); std::abort(); diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp index 47e8cd2a5a..98c1a2880c 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/cache_manager.hpp @@ -41,7 +41,7 @@ using storage_parser_t = storage_parser; + backtrace_region_sample, scratch_memory_sample>; using buffer_storage_t = buffer_storage; diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp index 92a25255e5..6303824aa8 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.cpp @@ -540,6 +540,72 @@ perfetto_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds #endif } +void +perfetto_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms) +{ +#if ROCPROFSYS_USE_ROCM > 0 + auto _corr_id = _sms.correlation_id_internal; + auto _stream_id = _sms.stream_handle; + auto _queue_id_handle = _sms.queue_id_handle; + const auto& _t_info = thread_info::get(_sms.thread_id, SystemTID); + const auto _thread_id_sequent = _t_info->index_data->sequent_value; + auto _beg_ts = _sms.start_timestamp; + auto _end_ts = _sms.end_timestamp; + + auto _agent_device_id = + m_agent_manager.get_agent_by_handle(_sms.agent_id_handle).device_type_index; + auto _name = std::string{ m_metadata.get_buffer_name_info().at( + static_cast(_sms.kind), + static_cast(_sms.operation)) }; + +// Scratch memory samples from SDK versions prior to 7.0.2 do not include +// allocation_size field, so counter tracks are not needed +# if ROCPROFSYS_ROCM_VERSION >= 70002 + using counter_track = + perfetto_counter_track; + + if(!counter_track::exists(_agent_device_id)) + { + auto _track_desc_alloc_size = JOIN("", "GPU Scratch Memory [", _agent_device_id, + "] Thread ", _thread_id_sequent); + counter_track::emplace(_agent_device_id, _track_desc_alloc_size, "bytes"); + } + + if(_sms.operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC) + { + TRACE_COUNTER("rocm_scratch_memory", counter_track::at(_agent_device_id, 0), + _beg_ts, _sms.allocation_size); + } +# endif + + auto _track_desc_events = [&]() { + return JOIN("", "GPU Scratch Memory Events Thread ", _thread_id_sequent); + }; + + const auto _track = + tracing::get_perfetto_track(category::rocm_scratch_memory{}, _track_desc_events); + + auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) { + if(!m_use_annotations) return; + + annotate_perfetto(ctx, { { "begin_ns", _beg_ts }, + { "end_ns", _end_ts }, + { "corr_id", _corr_id }, + { "stream_id", _stream_id }, + { "queue", _queue_id_handle }, + { "allocation_size", _sms.allocation_size }, + { "agent_id", _agent_device_id }, + { "operation", _name }, + { "flags", _sms.flags } }); + }; + + tracing::push_perfetto(category::rocm_scratch_memory{}, _name.c_str(), _track, + _beg_ts, ::perfetto::Flow::ProcessScoped(_corr_id), + add_perfetto_annotations); + tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track, _end_ts); +#endif +} + void perfetto_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs) { diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp index 5ddb2410b6..4dd487a657 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/perfetto_processor.hpp @@ -56,6 +56,7 @@ public: void finalize_processing(); void handle(const kernel_dispatch_sample& sample); + void handle(const scratch_memory_sample& sample); void handle(const memory_copy_sample& sample); void handle(const memory_allocate_sample& sample); void handle(const region_sample& sample); diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp index 567b1de230..cded11c3a8 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.cpp @@ -66,6 +66,37 @@ get_handle_from_code_object( # endif } #endif + +#if ROCPROFSYS_USE_ROCM > 0 +using memory_operation = std::string; +using memory_type = std::string; +std::pair +parse_memory_operation_name(std::string_view memory_operation_name) +{ + static const std::unordered_map> + parsing_map{ + { "MEMORY_ALLOCATION_NONE", { "NONE", "REAL" } }, + { "MEMORY_ALLOCATION_ALLOCATE", { "ALLOC", "REAL" } }, + { "MEMORY_ALLOCATION_VMEM_ALLOCATE", { "ALLOC", "VIRTUAL" } }, + { "MEMORY_ALLOCATION_FREE", { "FREE", "REAL" } }, + { "MEMORY_ALLOCATION_VMEM_FREE", { "FREE", "VIRTUAL" } }, + { "SCRATCH_MEMORY_NONE", { "NONE", "SCRATCH" } }, + { "SCRATCH_MEMORY_ALLOC", { "ALLOC", "SCRATCH" } }, + { "SCRATCH_MEMORY_FREE", { "FREE", "SCRATCH" } }, + { "SCRATCH_MEMORY_ASYNC_RECLAIM", { "ASYNC_RECLAIM", "SCRATCH" } }, + }; + + auto item = parsing_map.find(memory_operation_name); + if(item == parsing_map.end()) + { + LOG_WARNING("Unknown memory operation name: {}", memory_operation_name); + return { "UNKNOWN", "UNKNOWN" }; + } + + return item->second; +} +#endif } // namespace void @@ -110,6 +141,46 @@ rocpd_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds) #endif } +void +rocpd_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms) +{ +#if ROCPROFSYS_USE_ROCM > 0 + auto& n_info = node_info::get_instance(); + auto process = m_metadata->get_process_info(); + + const auto* _name = m_metadata->get_buffer_name_info().at( + static_cast(_sms.kind), + static_cast(_sms.operation)); + + auto agent_primary_key = + m_agent_manager->get_agent_by_handle(_sms.agent_id_handle).base_id; + + auto thread_primary_key = + m_data_processor->map_thread_id_to_primary_key(_sms.thread_id); + + auto category_primary_key = m_data_processor->insert_string( + trait::name::value); + + auto stack_id = _sms.correlation_id_internal; + auto parent_stack_id = _sms.correlation_id_ancestor; + auto correlation_id = 0; + auto address_value = 0; + + auto event_primary_key = m_data_processor->insert_event( + category_primary_key, stack_id, parent_stack_id, correlation_id); + + auto [memory_operation, memory_type] = parse_memory_operation_name(_name); + + auto extdata_json_str = JOIN("", "{\"flags\": ", _sms.flags, "}"); + + m_data_processor->insert_memory_alloc( + n_info.id, process.pid, thread_primary_key, agent_primary_key, + memory_operation.c_str(), memory_type.c_str(), _sms.start_timestamp, + _sms.end_timestamp, address_value, _sms.allocation_size, _sms.queue_id_handle, + _sms.stream_handle, event_primary_key, extdata_json_str.c_str()); +#endif +} + void rocpd_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs) { @@ -153,46 +224,6 @@ void rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas) { #if ROCPROFSYS_USE_ROCM > 0 && (ROCPROFILER_VERSION >= 600) - static auto memtype_to_db = - [](std::string_view memory_type) -> std::pair { - constexpr auto MEMORY_PREFIX = std::string_view{ "MEMORY_ALLOCATION_" }; - constexpr auto SCRATCH_PREFIX = std::string_view{ "SCRATCH_MEMORY_" }; - constexpr auto VMEM_PREFIX = std::string_view{ "VMEM_" }; - constexpr auto ASYNC_PREFIX = std::string_view{ "ASYNC_" }; - - std::string _type; - std::string _level; - if(memory_type.find(MEMORY_PREFIX) == 0) - { - _type = memory_type.substr(MEMORY_PREFIX.length()); - if(_type.find(VMEM_PREFIX) == 0) - { - _type = _type.substr(VMEM_PREFIX.length()); - _level = "VIRTUAL"; - } - else - { - _level = "REAL"; - } - } - else if(memory_type.find(SCRATCH_PREFIX) == 0) - { - _type = memory_type.substr(SCRATCH_PREFIX.length()); - _level = "SCRATCH"; - if(memory_type.find(ASYNC_PREFIX) == 0) - { - _type = memory_type.substr(ASYNC_PREFIX.length()); // RECLAIM - } - } - - if(_type == "ALLOCATE") - { - _type = "ALLOC"; - } - - return std::make_pair(_type, _level); - }; - auto& n_info = node_info::get_instance(); auto process = m_metadata->get_process_info(); auto thread_primary_key = @@ -210,7 +241,7 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas) static_cast(_mas.kind), static_cast(_mas.operation)); - auto [type, level] = memtype_to_db(_name); + auto [memory_operation, memory_type] = parse_memory_operation_name(_name); auto stack_id = _mas.correlation_id_internal; auto parent_stack_id = _mas.correlation_id_ancestor; @@ -224,9 +255,10 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas) category_primary_key, stack_id, parent_stack_id, correlation_id); m_data_processor->insert_memory_alloc( - n_info.id, process.pid, thread_primary_key, agent_primary_key, type.c_str(), - level.c_str(), _mas.start_timestamp, _mas.end_timestamp, _mas.address_value, - _mas.allocation_size, queue_id, _mas.stream_handle, event_primary_key); + n_info.id, process.pid, thread_primary_key, agent_primary_key, + memory_operation.c_str(), memory_type.c_str(), _mas.start_timestamp, + _mas.end_timestamp, _mas.address_value, _mas.allocation_size, queue_id, + _mas.stream_handle, event_primary_key); } #endif } diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp index 68f299a3e6..78c1d25670 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/rocpd_processor.hpp @@ -45,6 +45,7 @@ public: void finalize_processing(); void handle(const kernel_dispatch_sample& sample); + void handle(const scratch_memory_sample& sample); void handle(const memory_copy_sample& sample); void handle(const memory_allocate_sample& sample); void handle(const region_sample& sample); diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp index b55d80740b..b94232ef16 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_processor.hpp @@ -43,6 +43,11 @@ struct processor_t static_cast(this)->handle(sample); } + void handle(const scratch_memory_sample& sample) + { + static_cast(this)->handle(sample); + } + void handle(const memory_copy_sample& sample) { static_cast(this)->handle(sample); @@ -84,6 +89,7 @@ protected: struct processor_view_t { using kernel_dispatch_fn_t = void (*)(void*, const kernel_dispatch_sample&) noexcept; + using scratch_memory_fn_t = void (*)(void*, const scratch_memory_sample&) noexcept; using memory_copy_fn_t = void (*)(void*, const memory_copy_sample&) noexcept; #if(ROCPROFILER_VERSION >= 600) using memory_allocate_fn_t = void (*)(void*, const memory_allocate_sample&) noexcept; @@ -101,6 +107,7 @@ struct processor_view_t struct vtable_t { kernel_dispatch_fn_t handle_kernel_dispatch; + scratch_memory_fn_t handle_scratch_memory; memory_copy_fn_t handle_memory_copy; #if(ROCPROFILER_VERSION >= 600) memory_allocate_fn_t handle_memory_allocate; @@ -134,6 +141,11 @@ struct processor_view_t m_vtable->handle_kernel_dispatch(m_object, sample); } + ROCPROFSYS_INLINE void handle(const scratch_memory_sample& sample) const noexcept + { + m_vtable->handle_scratch_memory(m_object, sample); + } + ROCPROFSYS_INLINE void handle(const memory_copy_sample& sample) const noexcept { m_vtable->handle_memory_copy(m_object, sample); @@ -194,6 +206,9 @@ private: +[](void* obj, const kernel_dispatch_sample& sample) noexcept { static_cast(obj)->handle(sample); }, + +[](void* obj, const scratch_memory_sample& sample) noexcept { + static_cast(obj)->handle(sample); + }, +[](void* obj, const memory_copy_sample& sample) noexcept { static_cast(obj)->handle(sample); }, @@ -275,6 +290,9 @@ struct sample_processor_t case type_identifier_t::kernel_dispatch: handle_sample(static_cast(sample)); break; + case type_identifier_t::scratch_memory: + handle_sample(static_cast(sample)); + break; case type_identifier_t::memory_copy: handle_sample(static_cast(sample)); break; diff --git a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp index 9bd20bfc8c..d1119a44e4 100644 --- a/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp +++ b/projects/rocprofiler-systems/source/lib/core/trace_cache/sample_type.hpp @@ -46,6 +46,7 @@ enum class type_identifier_t : uint32_t amd_smi_sample = 0x0006, cpu_freq_sample = 0x0007, backtrace_region_sample = 0x0008, + scratch_memory = 0x0009, fragmented_space = 0xFFFF }; @@ -148,6 +149,83 @@ get_size(const kernel_dispatch_sample& item) item.grid_size_z, static_cast(item.stream_handle)); } +struct scratch_memory_sample : cacheable_t +{ + static constexpr type_identifier_t type_identifier = + type_identifier_t::scratch_memory; + + scratch_memory_sample() = default; + scratch_memory_sample(uint64_t _start_timestamp, uint64_t _end_timestamp, + uint64_t _thread_id, uint64_t _agent_id_handle, + uint64_t _queue_id_handle, int32_t _kind, int32_t _operation, + int32_t _flags, uint64_t _allocation_size, + uint64_t _correlation_id_internal, + uint64_t _correlation_id_ancestor, size_t _stream_handle) + : start_timestamp(_start_timestamp) + , end_timestamp(_end_timestamp) + , thread_id(_thread_id) + , agent_id_handle(_agent_id_handle) + , queue_id_handle(_queue_id_handle) + , kind(_kind) + , operation(_operation) + , flags(_flags) + , allocation_size(_allocation_size) + , correlation_id_internal(_correlation_id_internal) + , correlation_id_ancestor(_correlation_id_ancestor) + , stream_handle(_stream_handle) + {} + + uint64_t start_timestamp; + uint64_t end_timestamp; + uint64_t thread_id; + uint64_t agent_id_handle; + uint64_t queue_id_handle; + int32_t kind; + int32_t operation; + int32_t flags; + uint64_t allocation_size; + uint64_t correlation_id_internal; + uint64_t correlation_id_ancestor; + size_t stream_handle; +}; + +template <> +inline void +serialize(uint8_t* buffer, const scratch_memory_sample& item) +{ + utility::store_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id, + item.agent_id_handle, item.queue_id_handle, item.kind, + item.operation, item.flags, item.allocation_size, + item.correlation_id_internal, item.correlation_id_ancestor, + static_cast(item.stream_handle)); +} + +template <> +inline scratch_memory_sample +deserialize(uint8_t*& buffer) +{ + scratch_memory_sample item; + uint64_t stream_handle; + utility::parse_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id, + item.agent_id_handle, item.queue_id_handle, item.kind, + item.operation, item.flags, item.allocation_size, + item.correlation_id_internal, item.correlation_id_ancestor, + stream_handle); + item.stream_handle = stream_handle; + return item; +} + +template <> +inline size_t +get_size(const scratch_memory_sample& item) +{ + return utility::get_size(item.start_timestamp, item.end_timestamp, item.thread_id, + item.agent_id_handle, item.queue_id_handle, item.kind, + item.operation, item.flags, item.allocation_size, + item.correlation_id_internal, item.correlation_id_ancestor, + static_cast(item.stream_handle)); +} + struct memory_copy_sample : cacheable_t { static constexpr type_identifier_t type_identifier = type_identifier_t::memory_copy; diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp index 136487ecd0..da38dfe77b 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp @@ -1272,7 +1272,10 @@ shutdown() try { - data::shutdown(); + if(data::shutdown()) + { + ROCPROFSYS_AMD_SMI_CALL(amdsmi_shut_down()); + } } catch(std::runtime_error& _e) { LOG_WARNING("Exception thrown when shutting down amd-smi: {}", _e.what()); diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp index d1dd92a664..ce8654f206 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp @@ -24,6 +24,7 @@ #include "core/common.hpp" #include "core/config.hpp" #include "core/state.hpp" +#include "core/timemory.hpp" #include "library/runtime.hpp" #include @@ -33,9 +34,7 @@ #include "logger/debug.hpp" #include -#include #include -#include namespace rocprofsys { @@ -89,26 +88,6 @@ void exit_gotcha::operator()(const gotcha_data& _data, exit_func_t _func, int _ec) const { _exit_info = { true, _data.tool_id.find("quick") != std::string::npos, _ec }; - - if(config::get_use_amd_smi()) - { - threading::clear_callbacks(); - - if(get_state() < ::rocprofsys::State::Finalized && !is_child_process()) - { - LOG_DEBUG("Finalizing {} before calling {}({})...", get_exe_name(), - _data.tool_id, _ec); - - rocprofsys_finalize(); - } - - LOG_DEBUG("Calling _exit({}) in {} to avoid AMD SMI cleanup issues...", _ec, - get_exe_name().c_str()); - - std::fflush(nullptr); - _exit(_ec); - } - invoke_exit_gotcha(_data, _func, _ec); } diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp index d30368650b..0a337b2400 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk.cpp @@ -567,6 +567,18 @@ get_mem_alloc_address( } #endif +uint64_t +get_scratch_mem_alloc_size( + [[maybe_unused]] const rocprofiler_buffer_tracing_scratch_memory_record_t& record) +{ +// Scratch memory samples from SDK versions prior to 7.0.2 do not include allocation_size +#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002) + return record.allocation_size; +#else + return 0; +#endif +} + void cache_region(const rocprofiler_callback_tracing_record_t* record, const rocprofiler_timestamp_t start_timestamp, @@ -615,13 +627,26 @@ cache_kernel_dispatch(rocprofiler_buffer_tracing_kernel_dispatch_record_t* recor record->dispatch_info.grid_size.z, stream_handle }); } +void +cache_scratch_memory(rocprofiler_buffer_tracing_scratch_memory_record_t* record, + uint64_t stream_handle) +{ + trace_cache::get_metadata_registry().add_stream(stream_handle); + trace_cache::get_buffer_storage().store(trace_cache::scratch_memory_sample{ + record->start_timestamp, record->end_timestamp, record->thread_id, + record->agent_id.handle, record->queue_id.handle, + static_cast(record->kind), static_cast(record->operation), + static_cast(record->flags), get_scratch_mem_alloc_size(*record), + record->correlation_id.internal, get_parent_stack_id(record->correlation_id), + stream_handle }); +} + void cache_memory_copy(rocprofiler_buffer_tracing_memory_copy_record_t* record, uint64_t stream_handle) { trace_cache::get_metadata_registry().add_stream(stream_handle); trace_cache::get_buffer_storage().store(trace_cache::memory_copy_sample{ - record->start_timestamp, record->end_timestamp, record->thread_id, record->dst_agent_id.handle, record->src_agent_id.handle, static_cast(record->kind), static_cast(record->operation), @@ -1759,6 +1784,120 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/, } } } + else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) + { + auto* record = + static_cast( + header->payload); + + bool _group_by_queue = _default_group_by_queue; + + const auto* agent = tool_data->get_gpu_tool_agent(record->agent_id); + auto device_id = static_cast(agent->device_id); + + const auto& t_info = thread_info::get(record->thread_id, SystemTID); + auto thread_id_sequent = t_info->index_data->sequent_value; + + auto _corr_id = record->correlation_id.internal; + auto _beg_ns = record->start_timestamp; + auto _end_ns = record->end_timestamp; + auto _name = + tool_data->buffered_tracing_info.at(record->kind, record->operation); + + auto _stream_id = get_stream_id(record).handle; + if(_stream_id == 0) + { + // Scratch memory event is not associated with a HIP stream + _group_by_queue = true; + } + + { + auto track_name = JOIN("", "GPU Scratch Memory [", device_id, + "] Thread ", record->thread_id); + cache_category(); + cache_add_thread_info(record->thread_id); + cache_add_track(track_name.c_str(), record->thread_id); + cache_scratch_memory(record, _stream_id); + } + + if(get_use_timemory()) + { + auto _bundle = kernel_dispatch_bundle_t{ _name }; + + _bundle.push(thread_id_sequent).start().stop(); + _bundle.get([_beg_ns, _end_ns](tim::component::wall_clock* _wc) { + _wc->set_value(_end_ns - _beg_ns); + _wc->set_accum(_end_ns - _beg_ns); + }); + _bundle.pop(); + } + + if(get_use_perfetto()) + { +// Scratch memory samples from SDK versions prior to 7.0.2 do not include +// allocation_size field, so counter tracks are not needed +#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002) + using counter_track = perfetto_counter_track< + rocprofiler_buffer_tracing_scratch_memory_record_t>; + + if(!counter_track::exists(device_id)) + { + auto track_name_alloc_size = + JOIN("", "GPU Scratch Memory [", device_id, "] (S) Thread ", + thread_id_sequent); + counter_track::emplace(device_id, track_name_alloc_size, "bytes"); + } + + if(record->operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC) + { + TRACE_COUNTER("rocm_scratch_memory", + counter_track::at(device_id, 0), _beg_ns, + record->allocation_size); + } +#endif + auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) { + if(config::get_perfetto_annotations()) + { + tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns); + tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns); + tracing::add_perfetto_annotation(ctx, "corr_id", _corr_id); + tracing::add_perfetto_annotation(ctx, "stream_id", + _stream_id); + } + }; + + if(_group_by_queue) + { + auto track_name_events = [&]() { + return JOIN("", "GPU Scratch Memory (S) Events Thread ", + thread_id_sequent); + }; + const auto _track = tracing::get_perfetto_track( + category::rocm_scratch_memory{}, track_name_events); + + tracing::push_perfetto(category::rocm_scratch_memory{}, + _name.data(), _track, _beg_ns, + ::perfetto::Flow::ProcessScoped(_corr_id), + add_perfetto_annotations); + + tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track, + _end_ns); + } + else + { + const auto _track = tracing::get_perfetto_track( + category::rocm_hip_stream{}, _track_desc_stream, _stream_id); + + tracing::push_perfetto(category::rocm_hip_stream{}, _name.data(), + _track, _beg_ns, + ::perfetto::Flow::ProcessScoped(_corr_id), + add_perfetto_annotations); + + tracing::pop_perfetto(category::rocm_hip_stream{}, "", _track, + _end_ns); + } + } + } else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) { auto* record = @@ -2249,6 +2388,17 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data) _data->primary_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0, _data->memory_copy_buffer)); } + if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) > 0) + { + ROCPROFILER_CALL(rocprofiler_create_buffer( + _data->primary_ctx, buffer_size, watermark, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, tool_tracing_buffered, tool_data, + &_data->scratch_memory_buffer)); + + ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service( + _data->primary_ctx, ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, nullptr, 0, + _data->scratch_memory_buffer)); + } #if(ROCPROFILER_VERSION >= 600) if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION) > 0) diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp index fa75982db4..48d4714044 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/rocprofiler-sdk/fwd.hpp @@ -122,7 +122,7 @@ using backtrace_operation_map_t = struct client_data { - static constexpr size_t num_buffers = 4; + static constexpr size_t num_buffers = 5; static constexpr size_t num_contexts = 2; using buffer_name_info_t = rocprofiler::sdk::buffer_name_info_t; @@ -138,6 +138,7 @@ struct client_data rocprofiler_context_id_t primary_ctx = { 0 }; rocprofiler_context_id_t counter_ctx = { 0 }; rocprofiler_buffer_id_t kernel_dispatch_buffer = { 0 }; + rocprofiler_buffer_id_t scratch_memory_buffer = { 0 }; rocprofiler_buffer_id_t memory_copy_buffer = { 0 }; rocprofiler_buffer_id_t memory_alloc_buffer = { 0 }; rocprofiler_buffer_id_t counter_collection_buffer = { 0 }; @@ -179,12 +180,9 @@ client_data::get_contexts() const inline client_data::buffer_id_vec_t client_data::get_buffers() const { - return buffer_id_vec_t{ - kernel_dispatch_buffer, - memory_copy_buffer, - memory_alloc_buffer, - counter_collection_buffer, - }; + return buffer_id_vec_t{ kernel_dispatch_buffer, scratch_memory_buffer, + memory_copy_buffer, memory_alloc_buffer, + counter_collection_buffer }; } inline const rocprofsys_agent_t* diff --git a/projects/rocprofiler-systems/source/python/gui/source/gui.py b/projects/rocprofiler-systems/source/python/gui/source/gui.py index 115e717d5e..6e6233901c 100644 --- a/projects/rocprofiler-systems/source/python/gui/source/gui.py +++ b/projects/rocprofiler-systems/source/python/gui/source/gui.py @@ -47,7 +47,6 @@ from .parser import parse_uploaded_file from .parser import find_causal_files import plotly.graph_objects as go - file_timestamp = 0 global_data = pd.DataFrame() global_samples = pd.DataFrame() diff --git a/projects/rocprofiler-systems/source/python/rocprofsys/common.py b/projects/rocprofiler-systems/source/python/rocprofsys/common.py index 6f4b06e54b..f257e5e8f7 100644 --- a/projects/rocprofiler-systems/source/python/rocprofsys/common.py +++ b/projects/rocprofiler-systems/source/python/rocprofsys/common.py @@ -37,7 +37,6 @@ from . import libpyrocprofsys from .libpyrocprofsys.profiler import profiler_init as _profiler_init from .libpyrocprofsys.profiler import profiler_finalize as _profiler_fini - __all__ = ["exec_", "_file", "_get_argv", "_initialize", "_finalize"] diff --git a/projects/rocprofiler-systems/source/python/rocprofsys/user.py b/projects/rocprofiler-systems/source/python/rocprofsys/user.py index 9e51c9e638..b0871c8071 100644 --- a/projects/rocprofiler-systems/source/python/rocprofsys/user.py +++ b/projects/rocprofiler-systems/source/python/rocprofsys/user.py @@ -44,7 +44,6 @@ from .libpyrocprofsys.user import pop_region from .common import _initialize from .common import _file - __all__ = [ "region", "Region", diff --git a/projects/rocprofiler-systems/tests/validate-causal-json.py b/projects/rocprofiler-systems/tests/validate-causal-json.py index e10578c7f3..c3360d02ac 100755 --- a/projects/rocprofiler-systems/tests/validate-causal-json.py +++ b/projects/rocprofiler-systems/tests/validate-causal-json.py @@ -30,7 +30,6 @@ import math import argparse from collections import OrderedDict - num_stddev = 1.0 diff --git a/projects/rocprofiler-systems/tests/validate-rocpd.py b/projects/rocprofiler-systems/tests/validate-rocpd.py index 43f3241760..4714c282ea 100644 --- a/projects/rocprofiler-systems/tests/validate-rocpd.py +++ b/projects/rocprofiler-systems/tests/validate-rocpd.py @@ -75,8 +75,7 @@ class required_table: def print_help(): """Print out the help message""" - print( - f""" + print(f""" ROCPD Database Validation Tool DESCRIPTION: @@ -115,8 +114,7 @@ def print_help(): 64 - Invalid command line arguments (EX_USAGE) 65 - Validation failures detected (EX_DATAERR) 1 - General error (database connection, file not found, etc.) - """ - ) + """) def validate_table(cursor, rule, tables) -> bool: diff --git a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp index d23f889637..4fbf5aa145 100644 --- a/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp +++ b/projects/rocr-runtime/runtime/hsa-runtime/core/runtime/runtime.cpp @@ -3666,32 +3666,30 @@ hsa_status_t Runtime::VMemoryHandleMap(void* va, size_t size, size_t in_offset, // Create handle by exporting and importing the memory from the owning agent auto &agent_driver = agent->driver(); + ShareableHandle shareable_handle; +#if defined(__linux__) hsa_status_t status = agent_driver.ExportDMABuf(memoryHandleIt->first, size, &dmabuf_fd, &offset); if (status != HSA_STATUS_SUCCESS) return status; assert(offset == 0); - ShareableHandle shareable_handle; status = agent_driver.ImportDMABuf(dmabuf_fd, *agent, shareable_handle); if (status != HSA_STATUS_SUCCESS) return status; - if (dmabuf_fd != -1) { - close(dmabuf_fd); - } + close(dmabuf_fd); // Get address that memory is mapped to - if (shareable_handle.IsValid()) { - ret = GetAmdgpuDeviceArgs(agent, shareable_handle, &drm_fd, &drm_cpu_addr); - if (ret) return HSA_STATUS_ERROR; - } else { - hsa_status_t status = agent_driver.GetShareableHandle(va, memoryHandleIt->first, size, &shareable_handle); - if (status != HSA_STATUS_SUCCESS) { - return status; - } - drm_cpu_addr = reinterpret_cast(va); + ret = GetAmdgpuDeviceArgs(agent, shareable_handle, &drm_fd, &drm_cpu_addr); + if (ret) return HSA_STATUS_ERROR; +#else + hsa_status_t status = agent_driver.GetShareableHandle(va, memoryHandleIt->first, size, &shareable_handle); + if (status != HSA_STATUS_SUCCESS) { + return status; } + drm_cpu_addr = reinterpret_cast(va); +#endif mapped_handle_map_.emplace( std::piecewise_construct, std::forward_as_tuple(va), @@ -3783,6 +3781,7 @@ Runtime::MappedHandleAllowedAgent::MappedHandleAllowedAgent( uint64_t offset = 0; MemoryHandle *memHandle = mappedHandle->mem_handle; +#if defined(__linux__) // Export memory from owner agent. hsa_status_t status = memHandle->agentOwner()->driver().ExportDMABuf( memHandle->thunk_handle, mappedHandle->size, &dmabuf_fd, &offset); @@ -3798,6 +3797,9 @@ Runtime::MappedHandleAllowedAgent::MappedHandleAllowedAgent( close(dmabuf_fd); if (status != HSA_STATUS_SUCCESS) return; +#else + shareable_handle.handle = _mappedHandle->shareable_handle.handle; +#endif } Runtime::MappedHandleAllowedAgent::~MappedHandleAllowedAgent() {