Merge branch 'develop' into amd/dev/gandryey/SWDEV-558849

This commit is contained in:
German Andryeyev
2026-01-20 12:04:53 -05:00
committad av GitHub
förälder 07a6b45535 4a5cbbfba5
incheckning 3af2bf4952
103 ändrade filer med 2367 tillägg och 528 borttagningar
@@ -41,7 +41,7 @@ jobs:
working-directory: projects/rocprofiler-compute working-directory: projects/rocprofiler-compute
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
python -m pip install ruff python -m pip install ruff==0.14.11
if [ -f requirements.txt ]; then python -m pip install -r requirements.txt; fi if [ -f requirements.txt ]; then python -m pip install -r requirements.txt; fi
- name: Run Ruff Linter and Import Sorter - name: Run Ruff Linter and Import Sorter
run: | run: |
@@ -226,6 +226,12 @@ class AMDSMICommands():
# Set args.* to passed in arguments # Set args.* to passed in arguments
if gpu: if gpu:
args.gpu = gpu args.gpu = gpu
cpu_attributes = ["cpu"]
for attr in cpu_attributes:
if hasattr(args, 'cpu') and getattr(args, 'cpu'):
print("N/A")
return
# Handle No GPU passed # Handle No GPU passed
if args.gpu == None: if args.gpu == None:
+1
Visa fil
@@ -8,6 +8,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
* New HIP APIs * New HIP APIs
- `hipKernelGetParamInfo` returns the offset and size of a kernel parameter - `hipKernelGetParamInfo` returns the offset and size of a kernel parameter
* Support for `barrier_arrive` and `barrier_wait` for `grid_group` and `thread_block`.
* New HIP supports * New HIP supports
- `grid_group::block_rank()` returns the rank of the block in the calling thread - `grid_group::block_rank()` returns the rank of the block in the calling thread
@@ -111,9 +111,8 @@
#if !defined(__HIPCC_RTC__) #if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_common.h> #include <hip/amd_detail/amd_hip_common.h>
#include "amd_hip_vector_types.h" // float2 etc #include "amd_hip_vector_types.h" // float2 etc
#include "device_library_decls.h" // ocml conversion functions
#include "math_fwd.h" // ocml device functions
#if defined(__clang__) && defined(__HIP__) #if defined(__clang__) && defined(__HIP__)
#include "math_fwd.h" // ocml device functions
#include <hip/amd_detail/amd_warp_functions.h> // define warpSize #include <hip/amd_detail/amd_warp_functions.h> // define warpSize
#include <hip/amd_detail/amd_warp_sync_functions.h> // Sync functions #include <hip/amd_detail/amd_warp_sync_functions.h> // Sync functions
#endif #endif
@@ -338,7 +337,11 @@ struct __attribute__((aligned(2))) __hip_bfloat16 {
}; };
/**@}*/ /**@}*/
#if defined(__clang__)
typedef __bf16 __bf16_2 __attribute__((ext_vector_type(2))); typedef __bf16 __bf16_2 __attribute__((ext_vector_type(2)));
#else
typedef __bf16 __bf16_2 __attribute__((vector_size(sizeof(__bf16) * 2)));
#endif
/** /**
* \defgroup HIP_INTRINSIC_BFLOAT162_STRUCT * \defgroup HIP_INTRINSIC_BFLOAT162_STRUCT
@@ -350,6 +353,7 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
static_assert(sizeof(__hip_bfloat16[2]) == sizeof(__bf16_2)); static_assert(sizeof(__hip_bfloat16[2]) == sizeof(__bf16_2));
public: public:
#if defined(__clang__)
union { union {
struct { struct {
__hip_bfloat16 x; /*! \brief raw representation of bfloat16 */ __hip_bfloat16 x; /*! \brief raw representation of bfloat16 */
@@ -357,7 +361,12 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
}; };
__bf16_2 __xy_bf162; __bf16_2 __xy_bf162;
}; };
#else
/* GCC does not support anonymous structs with members that have non-trivial constructors (Clang
allows this as an extension). Expose x and y directly instead. */
__hip_bfloat16 x;
__hip_bfloat16 y;
#endif
public: public:
/*! \brief create __hip_bfloat162 from __hip_bfloat162_raw */ /*! \brief create __hip_bfloat162 from __hip_bfloat162_raw */
@@ -373,7 +382,11 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
: x(a), y(b) {} : x(a), y(b) {}
/*! \brief create __hip_bfloat162 from vector of __bf16_2 */ /*! \brief create __hip_bfloat162 from vector of __bf16_2 */
#if defined(__clang__)
__BF16_HOST_DEVICE__ __hip_bfloat162(const __bf16_2 in) : __xy_bf162(in) {} __BF16_HOST_DEVICE__ __hip_bfloat162(const __bf16_2 in) : __xy_bf162(in) {}
#else
__BF16_HOST_DEVICE__ __hip_bfloat162(const __bf16_2 in) : x{in[0]}, y{in[1]} {}
#endif
/*! \brief default constructor of __hip_bfloat162 */ /*! \brief default constructor of __hip_bfloat162 */
__BF16_HOST_DEVICE__ __hip_bfloat162() = default; __BF16_HOST_DEVICE__ __hip_bfloat162() = default;
@@ -392,11 +405,22 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
} }
/*! \brief return a vector of bf16 */ /*! \brief return a vector of bf16 */
__BF16_HOST_DEVICE__ operator __bf16_2() const { return __xy_bf162; } __BF16_HOST_DEVICE__ operator __bf16_2() const {
#if defined(__clang__)
return __xy_bf162;
#else
return __bf16_2{x, y};
#endif
}
/*! \brief return a vector of bf16 */ /*! \brief return a vector of bf16 */
__BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __bf16_2 in) { __BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __bf16_2 in) {
#if defined(__clang__)
__xy_bf162 = in; __xy_bf162 = in;
#else
x = __hip_bfloat16{in[0]};
y = __hip_bfloat16{in[1]};
#endif
return *this; return *this;
} }
@@ -835,6 +859,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const
return (__bf16)a / (__bf16)b; return (__bf16)a / (__bf16)b;
} }
#if defined(__clang__) && defined(__HIP__)
/** /**
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
* \brief Performs FMA of given bfloat16 values * \brief Performs FMA of given bfloat16 values
@@ -844,6 +869,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip
return __hip_bfloat16(__builtin_elementwise_fma(__bf16(a), __bf16(b), __bf16(c))); return __hip_bfloat16(__builtin_elementwise_fma(__bf16(a), __bf16(b), __bf16(c)));
; ;
} }
#endif
/** /**
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH * \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
@@ -919,6 +945,8 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2_rn(const __hip_bfloat162 a,
return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)}; return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)};
} }
#if defined(__clang__) && defined(__HIP__)
/** /**
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
* \brief Performs FMA of given bfloat162 values * \brief Performs FMA of given bfloat162 values
@@ -927,6 +955,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __
const __hip_bfloat162 c) { const __hip_bfloat162 c) {
return __hip_bfloat162{__builtin_elementwise_fma(__bf16_2(a), __bf16_2(b), __bf16_2(c))}; return __hip_bfloat162{__builtin_elementwise_fma(__bf16_2(a), __bf16_2(b), __bf16_2(c))};
} }
#endif
/** /**
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH * \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
@@ -1639,6 +1668,7 @@ __BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat162& l, const __hi
return fl.x >= fr.x && fl.x >= fr.y; return fl.x >= fr.x && fl.x >= fr.y;
} }
#if defined(__clang__) && defined(__HIP__)
/** /**
* \ingroup HIP_INTRINSIC_BFLOAT16_MATH * \ingroup HIP_INTRINSIC_BFLOAT16_MATH
* \brief Calculate ceil of bfloat16 * \brief Calculate ceil of bfloat16
@@ -1883,7 +1913,6 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h) {
return __hip_bfloat162(htrunc(h.x), htrunc(h.y)); return __hip_bfloat162(htrunc(h.x), htrunc(h.y));
} }
#if defined(__clang__) && defined(__HIP__)
/** /**
* \ingroup HIP_INTRINSIC_BFLOAT162_MATH * \ingroup HIP_INTRINSIC_BFLOAT162_MATH
* \brief Atomic add bfloat162 * \brief Atomic add bfloat162
@@ -212,6 +212,19 @@ class grid_group : public thread_group {
//! @copydoc thread_group::sync //! @copydoc thread_group::sync
__CG_QUALIFIER__ void sync() const { internal::grid::sync(); } __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
__CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); } __CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
//! Token returned by barrier_arrive() and consumed by barrier_wait().
struct arrival_token {
unsigned int signal;  //!< Value obtained from internal::grid::barrier_signal()
};
//! Arrive at a barrier.
//! \return an arrival_token whose `signal` holds the value returned by
//!         internal::grid::barrier_signal(); pass it to barrier_wait().
__CG_QUALIFIER__ arrival_token barrier_arrive() const {
  return arrival_token{internal::grid::barrier_signal()};
}
//! Wait at a barrier.
//! \param t token obtained from barrier_arrive(); its `signal` value is
//!          forwarded to internal::grid::barrier_wait().
__CG_QUALIFIER__ void barrier_wait(arrival_token&& t) const {
internal::grid::barrier_wait(t.signal);
}
}; };
/** \ingroup CooperativeGConstruct /** \ingroup CooperativeGConstruct
@@ -295,6 +308,14 @@ class thread_block : public thread_group {
__CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); } __CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
//! Returns the group dimensions. //! Returns the group dimensions.
__CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); } __CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
//! Token returned by barrier_arrive() and consumed by barrier_wait(); it carries no data.
struct arrival_token {};
//! Arrive at a barrier.
//! Calls internal::workgroup::barrier_arrive() and returns an empty arrival_token
//! to be passed to barrier_wait().
__CG_QUALIFIER__ arrival_token barrier_arrive() const {
internal::workgroup::barrier_arrive();
return arrival_token{};
}
//! Wait at a barrier.
//! The token argument is unnamed and unused; the call forwards to
//! internal::workgroup::barrier_wait().
__CG_QUALIFIER__ void barrier_wait(arrival_token&&) const { internal::workgroup::barrier_wait(); }
}; };
/** \ingroup CooperativeGConstruct /** \ingroup CooperativeGConstruct
@@ -97,6 +97,8 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void); extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void); extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void); extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
// Device-library entry points for the split grid barrier: `arrive` returns a
// uint and `wait` takes a uint.
// NOTE(review): `arrive` is declared `const`, which permits the compiler to merge
// or drop calls as if they had no side effects — confirm this is intended for a
// barrier arrival (the sibling `wait` is declared `convergent` instead).
extern "C" __device__ __attribute__((const)) uint __ockl_grid_bar_arrive(void);
extern "C" __device__ __attribute__((convergent)) void __ockl_grid_bar_wait(uint);
extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float); extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
@@ -63,7 +63,7 @@
#define HIP_API_TABLE_STEP_VERSION 0 #define HIP_API_TABLE_STEP_VERSION 0
#define HIP_COMPILER_API_TABLE_STEP_VERSION 0 #define HIP_COMPILER_API_TABLE_STEP_VERSION 0
#define HIP_TOOLS_API_TABLE_STEP_VERSION 0 #define HIP_TOOLS_API_TABLE_STEP_VERSION 0
#define HIP_RUNTIME_API_TABLE_STEP_VERSION 20 #define HIP_RUNTIME_API_TABLE_STEP_VERSION 21
// HIP API interface // HIP API interface
// HIP compiler dispatch functions // HIP compiler dispatch functions
@@ -1113,6 +1113,9 @@ typedef hipError_t (*t_hipKernelGetLibrary)(hipLibrary_t* library, hipKernel_t k
typedef hipError_t (*t_hipKernelGetName)(const char** name, hipKernel_t kernel); typedef hipError_t (*t_hipKernelGetName)(const char** name, hipKernel_t kernel);
typedef hipError_t (*t_hipGetProcAddress_spt)(const char* symbol, void** pfn, int hipVersion, uint64_t flags, typedef hipError_t (*t_hipGetProcAddress_spt)(const char* symbol, void** pfn, int hipVersion, uint64_t flags,
hipDriverProcAddressQueryResult* symbolStatus); hipDriverProcAddressQueryResult* symbolStatus);
typedef hipError_t (*t_hipExtDisableLogging)();
typedef hipError_t (*t_hipExtEnableLogging)();
typedef hipError_t (*t_hipExtSetLoggingParams)(size_t log_level, size_t log_size, size_t log_mask);
typedef hipError_t (*t_hipKernelGetParamInfo)(hipKernel_t kernel, size_t paramIndex, typedef hipError_t (*t_hipKernelGetParamInfo)(hipKernel_t kernel, size_t paramIndex,
size_t* paramOffset, size_t* paramSize); size_t* paramOffset, size_t* paramSize);
@@ -1707,8 +1710,13 @@ struct HipDispatchTable {
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 20 // HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
t_hipKernelGetParamInfo hipKernelGetParamInfo_fn; t_hipKernelGetParamInfo hipKernelGetParamInfo_fn;
// DO NOT EDIT ABOVE!
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 21 // HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
t_hipExtDisableLogging hipExtDisableLogging_fn;
t_hipExtEnableLogging hipExtEnableLogging_fn;
t_hipExtSetLoggingParams hipExtSetLoggingParams_fn;
// DO NOT EDIT ABOVE!
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 22
// ******************************************************************************************* // // ******************************************************************************************* //
// //
@@ -196,6 +196,9 @@ __CG_STATIC_QUALIFIER__ dim3 grid_dim() {
static_cast<__hip_uint32_t>(gridDim.z))); static_cast<__hip_uint32_t>(gridDim.z)));
} }
// Arrive at the grid barrier through the device library and hand back the
// value it returns, for a later barrier_wait() call.
__CG_STATIC_QUALIFIER__ unsigned int barrier_signal() {
  const unsigned int signal = __ockl_grid_bar_arrive();
  return signal;
}
// Wait on the grid barrier; `s` is the value previously returned by barrier_signal()
// and is forwarded unchanged to __ockl_grid_bar_wait().
__CG_STATIC_QUALIFIER__ void barrier_wait(unsigned int s) { __ockl_grid_bar_wait(s); }
} // namespace grid } // namespace grid
/** /**
@@ -238,6 +241,23 @@ __CG_STATIC_QUALIFIER__ dim3 block_dim() {
static_cast<__hip_uint32_t>(blockDim.z))); static_cast<__hip_uint32_t>(blockDim.z)));
} }
// Arrival half of the split workgroup barrier: issue a release fence, then signal
// arrival when the compiler provides the split-barrier builtins.
__CG_STATIC_QUALIFIER__ void barrier_arrive() {
// Release fence at "workgroup" scope, issued before the arrival signal.
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \
__has_builtin(__builtin_amdgcn_s_barrier_wait)
// The same -1 argument is passed to the matching wait builtin in barrier_wait().
// NOTE(review): presumably selects the default workgroup barrier — confirm.
__builtin_amdgcn_s_barrier_signal(-1);
#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
// Without both builtins nothing is signalled here; barrier_wait() then falls back
// to a full __builtin_amdgcn_s_barrier().
}
// Wait half of the split workgroup barrier: wait for the barrier, then issue an
// acquire fence.
__CG_STATIC_QUALIFIER__ void barrier_wait() {
#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \
__has_builtin(__builtin_amdgcn_s_barrier_wait)
// Pairs with the __builtin_amdgcn_s_barrier_signal(-1) issued by barrier_arrive().
__builtin_amdgcn_s_barrier_wait(-1);
#else
// Fallback when the split-barrier builtins are unavailable: barrier_arrive() did
// not signal, so a full barrier is executed here instead.
__builtin_amdgcn_s_barrier();
#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
// Acquire fence at "workgroup" scope, issued after the barrier.
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
}
} // namespace workgroup } // namespace workgroup
namespace tiled_group { namespace tiled_group {
@@ -468,7 +468,10 @@ enum hip_api_id_t {
HIP_API_ID_hipKernelGetName = 448, HIP_API_ID_hipKernelGetName = 448,
HIP_API_ID_hipOccupancyAvailableDynamicSMemPerBlock = 449, HIP_API_ID_hipOccupancyAvailableDynamicSMemPerBlock = 449,
HIP_API_ID_hipKernelGetParamInfo = 450, HIP_API_ID_hipKernelGetParamInfo = 450,
HIP_API_ID_LAST = 450, HIP_API_ID_hipExtDisableLogging = 451,
HIP_API_ID_hipExtEnableLogging = 452,
HIP_API_ID_hipExtSetLoggingParams = 453,
HIP_API_ID_LAST = 453,
HIP_API_ID_hipChooseDevice = HIP_API_ID_CONCAT(HIP_API_ID_,hipChooseDevice), HIP_API_ID_hipChooseDevice = HIP_API_ID_CONCAT(HIP_API_ID_,hipChooseDevice),
HIP_API_ID_hipGetDeviceProperties = HIP_API_ID_CONCAT(HIP_API_ID_,hipGetDeviceProperties), HIP_API_ID_hipGetDeviceProperties = HIP_API_ID_CONCAT(HIP_API_ID_,hipGetDeviceProperties),
@@ -590,12 +593,15 @@ static inline const char* hip_api_name(const uint32_t id) {
case HIP_API_ID_hipEventRecord: return "hipEventRecord"; case HIP_API_ID_hipEventRecord: return "hipEventRecord";
case HIP_API_ID_hipEventRecordWithFlags: return "hipEventRecordWithFlags"; case HIP_API_ID_hipEventRecordWithFlags: return "hipEventRecordWithFlags";
case HIP_API_ID_hipEventSynchronize: return "hipEventSynchronize"; case HIP_API_ID_hipEventSynchronize: return "hipEventSynchronize";
case HIP_API_ID_hipExtDisableLogging: return "hipExtDisableLogging";
case HIP_API_ID_hipExtEnableLogging: return "hipExtEnableLogging";
case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError"; case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError";
case HIP_API_ID_hipExtGetLinkTypeAndHopCount: return "hipExtGetLinkTypeAndHopCount"; case HIP_API_ID_hipExtGetLinkTypeAndHopCount: return "hipExtGetLinkTypeAndHopCount";
case HIP_API_ID_hipExtLaunchKernel: return "hipExtLaunchKernel"; case HIP_API_ID_hipExtLaunchKernel: return "hipExtLaunchKernel";
case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: return "hipExtLaunchMultiKernelMultiDevice"; case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: return "hipExtLaunchMultiKernelMultiDevice";
case HIP_API_ID_hipExtMallocWithFlags: return "hipExtMallocWithFlags"; case HIP_API_ID_hipExtMallocWithFlags: return "hipExtMallocWithFlags";
case HIP_API_ID_hipExtModuleLaunchKernel: return "hipExtModuleLaunchKernel"; case HIP_API_ID_hipExtModuleLaunchKernel: return "hipExtModuleLaunchKernel";
case HIP_API_ID_hipExtSetLoggingParams: return "hipExtSetLoggingParams";
case HIP_API_ID_hipExtStreamCreateWithCUMask: return "hipExtStreamCreateWithCUMask"; case HIP_API_ID_hipExtStreamCreateWithCUMask: return "hipExtStreamCreateWithCUMask";
case HIP_API_ID_hipExtStreamGetCUMask: return "hipExtStreamGetCUMask"; case HIP_API_ID_hipExtStreamGetCUMask: return "hipExtStreamGetCUMask";
case HIP_API_ID_hipExternalMemoryGetMappedBuffer: return "hipExternalMemoryGetMappedBuffer"; case HIP_API_ID_hipExternalMemoryGetMappedBuffer: return "hipExternalMemoryGetMappedBuffer";
@@ -1034,12 +1040,15 @@ static inline uint32_t hipApiIdByName(const char* name) {
if (strcmp("hipEventRecord", name) == 0) return HIP_API_ID_hipEventRecord; if (strcmp("hipEventRecord", name) == 0) return HIP_API_ID_hipEventRecord;
if (strcmp("hipEventRecordWithFlags", name) == 0) return HIP_API_ID_hipEventRecordWithFlags; if (strcmp("hipEventRecordWithFlags", name) == 0) return HIP_API_ID_hipEventRecordWithFlags;
if (strcmp("hipEventSynchronize", name) == 0) return HIP_API_ID_hipEventSynchronize; if (strcmp("hipEventSynchronize", name) == 0) return HIP_API_ID_hipEventSynchronize;
if (strcmp("hipExtDisableLogging", name) == 0) return HIP_API_ID_hipExtDisableLogging;
if (strcmp("hipExtEnableLogging", name) == 0) return HIP_API_ID_hipExtEnableLogging;
if (strcmp("hipExtGetLastError", name) == 0) return HIP_API_ID_hipExtGetLastError; if (strcmp("hipExtGetLastError", name) == 0) return HIP_API_ID_hipExtGetLastError;
if (strcmp("hipExtGetLinkTypeAndHopCount", name) == 0) return HIP_API_ID_hipExtGetLinkTypeAndHopCount; if (strcmp("hipExtGetLinkTypeAndHopCount", name) == 0) return HIP_API_ID_hipExtGetLinkTypeAndHopCount;
if (strcmp("hipExtLaunchKernel", name) == 0) return HIP_API_ID_hipExtLaunchKernel; if (strcmp("hipExtLaunchKernel", name) == 0) return HIP_API_ID_hipExtLaunchKernel;
if (strcmp("hipExtLaunchMultiKernelMultiDevice", name) == 0) return HIP_API_ID_hipExtLaunchMultiKernelMultiDevice; if (strcmp("hipExtLaunchMultiKernelMultiDevice", name) == 0) return HIP_API_ID_hipExtLaunchMultiKernelMultiDevice;
if (strcmp("hipExtMallocWithFlags", name) == 0) return HIP_API_ID_hipExtMallocWithFlags; if (strcmp("hipExtMallocWithFlags", name) == 0) return HIP_API_ID_hipExtMallocWithFlags;
if (strcmp("hipExtModuleLaunchKernel", name) == 0) return HIP_API_ID_hipExtModuleLaunchKernel; if (strcmp("hipExtModuleLaunchKernel", name) == 0) return HIP_API_ID_hipExtModuleLaunchKernel;
if (strcmp("hipExtSetLoggingParams", name) == 0) return HIP_API_ID_hipExtSetLoggingParams;
if (strcmp("hipExtStreamCreateWithCUMask", name) == 0) return HIP_API_ID_hipExtStreamCreateWithCUMask; if (strcmp("hipExtStreamCreateWithCUMask", name) == 0) return HIP_API_ID_hipExtStreamCreateWithCUMask;
if (strcmp("hipExtStreamGetCUMask", name) == 0) return HIP_API_ID_hipExtStreamGetCUMask; if (strcmp("hipExtStreamGetCUMask", name) == 0) return HIP_API_ID_hipExtStreamGetCUMask;
if (strcmp("hipExternalMemoryGetMappedBuffer", name) == 0) return HIP_API_ID_hipExternalMemoryGetMappedBuffer; if (strcmp("hipExternalMemoryGetMappedBuffer", name) == 0) return HIP_API_ID_hipExternalMemoryGetMappedBuffer;
@@ -1851,6 +1860,11 @@ typedef struct hip_api_data_s {
hipEvent_t stopEvent; hipEvent_t stopEvent;
unsigned int flags; unsigned int flags;
} hipExtModuleLaunchKernel; } hipExtModuleLaunchKernel;
struct {
size_t log_level;
size_t log_size;
size_t log_mask;
} hipExtSetLoggingParams;
struct { struct {
hipStream_t* stream; hipStream_t* stream;
hipStream_t stream__val; hipStream_t stream__val;
@@ -4484,6 +4498,12 @@ typedef struct hip_api_data_s {
#define INIT_hipEventSynchronize_CB_ARGS_DATA(cb_data) { \ #define INIT_hipEventSynchronize_CB_ARGS_DATA(cb_data) { \
cb_data.args.hipEventSynchronize.event = (hipEvent_t)event; \ cb_data.args.hipEventSynchronize.event = (hipEvent_t)event; \
}; };
// hipExtDisableLogging[]
#define INIT_hipExtDisableLogging_CB_ARGS_DATA(cb_data) { \
};
// hipExtEnableLogging[]
#define INIT_hipExtEnableLogging_CB_ARGS_DATA(cb_data) { \
};
// hipExtGetLastError[] // hipExtGetLastError[]
#define INIT_hipExtGetLastError_CB_ARGS_DATA(cb_data) { \ #define INIT_hipExtGetLastError_CB_ARGS_DATA(cb_data) { \
}; };
@@ -4535,6 +4555,12 @@ typedef struct hip_api_data_s {
cb_data.args.hipExtModuleLaunchKernel.stopEvent = (hipEvent_t)stopEvent; \ cb_data.args.hipExtModuleLaunchKernel.stopEvent = (hipEvent_t)stopEvent; \
cb_data.args.hipExtModuleLaunchKernel.flags = (unsigned int)flags; \ cb_data.args.hipExtModuleLaunchKernel.flags = (unsigned int)flags; \
}; };
// hipExtSetLoggingParams[('size_t', 'log_level'), ('size_t', 'log_size'), ('size_t', 'log_mask')]
#define INIT_hipExtSetLoggingParams_CB_ARGS_DATA(cb_data) { \
cb_data.args.hipExtSetLoggingParams.log_level = (size_t)log_level; \
cb_data.args.hipExtSetLoggingParams.log_size = (size_t)log_size; \
cb_data.args.hipExtSetLoggingParams.log_mask = (size_t)log_mask; \
};
// hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')] // hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')]
#define INIT_hipExtStreamCreateWithCUMask_CB_ARGS_DATA(cb_data) { \ #define INIT_hipExtStreamCreateWithCUMask_CB_ARGS_DATA(cb_data) { \
cb_data.args.hipExtStreamCreateWithCUMask.stream = (hipStream_t*)stream; \ cb_data.args.hipExtStreamCreateWithCUMask.stream = (hipStream_t*)stream; \
@@ -7125,6 +7151,12 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
// hipEventSynchronize[('hipEvent_t', 'event')] // hipEventSynchronize[('hipEvent_t', 'event')]
case HIP_API_ID_hipEventSynchronize: case HIP_API_ID_hipEventSynchronize:
break; break;
// hipExtDisableLogging[]
case HIP_API_ID_hipExtDisableLogging:
break;
// hipExtEnableLogging[]
case HIP_API_ID_hipExtEnableLogging:
break;
// hipExtGetLastError[] // hipExtGetLastError[]
case HIP_API_ID_hipExtGetLastError: case HIP_API_ID_hipExtGetLastError:
break; break;
@@ -7150,6 +7182,9 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
if (data->args.hipExtModuleLaunchKernel.kernelParams) data->args.hipExtModuleLaunchKernel.kernelParams__val = *(data->args.hipExtModuleLaunchKernel.kernelParams); if (data->args.hipExtModuleLaunchKernel.kernelParams) data->args.hipExtModuleLaunchKernel.kernelParams__val = *(data->args.hipExtModuleLaunchKernel.kernelParams);
if (data->args.hipExtModuleLaunchKernel.extra) data->args.hipExtModuleLaunchKernel.extra__val = *(data->args.hipExtModuleLaunchKernel.extra); if (data->args.hipExtModuleLaunchKernel.extra) data->args.hipExtModuleLaunchKernel.extra__val = *(data->args.hipExtModuleLaunchKernel.extra);
break; break;
// hipExtSetLoggingParams[('size_t', 'log_level'), ('size_t', 'log_size'), ('size_t', 'log_mask')]
case HIP_API_ID_hipExtSetLoggingParams:
break;
// hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')] // hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')]
case HIP_API_ID_hipExtStreamCreateWithCUMask: case HIP_API_ID_hipExtStreamCreateWithCUMask:
if (data->args.hipExtStreamCreateWithCUMask.stream) data->args.hipExtStreamCreateWithCUMask.stream__val = *(data->args.hipExtStreamCreateWithCUMask.stream); if (data->args.hipExtStreamCreateWithCUMask.stream) data->args.hipExtStreamCreateWithCUMask.stream__val = *(data->args.hipExtStreamCreateWithCUMask.stream);
@@ -9124,6 +9159,14 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da
oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventSynchronize.event); oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventSynchronize.event);
oss << ")"; oss << ")";
break; break;
case HIP_API_ID_hipExtDisableLogging:
oss << "hipExtDisableLogging(";
oss << ")";
break;
case HIP_API_ID_hipExtEnableLogging:
oss << "hipExtEnableLogging(";
oss << ")";
break;
case HIP_API_ID_hipExtGetLastError: case HIP_API_ID_hipExtGetLastError:
oss << "hipExtGetLastError("; oss << "hipExtGetLastError(";
oss << ")"; oss << ")";
@@ -9188,6 +9231,13 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da
oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.flags); oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.flags);
oss << ")"; oss << ")";
break; break;
case HIP_API_ID_hipExtSetLoggingParams:
oss << "hipExtSetLoggingParams(";
oss << "log_level="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_level);
oss << ", log_size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_size);
oss << ", log_mask="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_mask);
oss << ")";
break;
case HIP_API_ID_hipExtStreamCreateWithCUMask: case HIP_API_ID_hipExtStreamCreateWithCUMask:
oss << "hipExtStreamCreateWithCUMask("; oss << "hipExtStreamCreateWithCUMask(";
if (data->args.hipExtStreamCreateWithCUMask.stream == NULL) oss << "stream=NULL"; if (data->args.hipExtStreamCreateWithCUMask.stream == NULL) oss << "stream=NULL";
+1
Visa fil
@@ -110,6 +110,7 @@ target_sources(amdhip64 PRIVATE
hip_graph.cpp hip_graph.cpp
hip_hmm.cpp hip_hmm.cpp
hip_intercept.cpp hip_intercept.cpp
hip_log.cpp
hip_memory.cpp hip_memory.cpp
hip_mempool.cpp hip_mempool.cpp
hip_mempool_impl.cpp hip_mempool_impl.cpp
+3
Visa fil
@@ -523,3 +523,6 @@ hipKernelGetName
hipOccupancyAvailableDynamicSMemPerBlock hipOccupancyAvailableDynamicSMemPerBlock
hipGetProcAddress_spt hipGetProcAddress_spt
hipKernelGetParamInfo hipKernelGetParamInfo
hipExtDisableLogging
hipExtEnableLogging
hipExtSetLoggingParams
+12 -2
Visa fil
@@ -885,6 +885,9 @@ hipError_t hipOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, con
int numBlocks, int blockSize); int numBlocks, int blockSize);
hipError_t hipKernelGetParamInfo(hipKernel_t kernel, size_t paramIndex, size_t* paramOffset, hipError_t hipKernelGetParamInfo(hipKernel_t kernel, size_t paramIndex, size_t* paramOffset,
size_t* paramSize); size_t* paramSize);
hipError_t hipExtDisableLogging();
hipError_t hipExtEnableLogging();
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask);
} // namespace hip } // namespace hip
namespace hip { namespace hip {
@@ -1432,6 +1435,9 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) {
ptrDispatchTable->hipKernelGetName_fn = hip::hipKernelGetName; ptrDispatchTable->hipKernelGetName_fn = hip::hipKernelGetName;
ptrDispatchTable->hipOccupancyAvailableDynamicSMemPerBlock_fn = hip::hipOccupancyAvailableDynamicSMemPerBlock; ptrDispatchTable->hipOccupancyAvailableDynamicSMemPerBlock_fn = hip::hipOccupancyAvailableDynamicSMemPerBlock;
ptrDispatchTable->hipKernelGetParamInfo_fn = hip::hipKernelGetParamInfo; ptrDispatchTable->hipKernelGetParamInfo_fn = hip::hipKernelGetParamInfo;
ptrDispatchTable->hipExtDisableLogging_fn = hip::hipExtDisableLogging;
ptrDispatchTable->hipExtEnableLogging_fn = hip::hipExtEnableLogging;
ptrDispatchTable->hipExtSetLoggingParams_fn = hip::hipExtSetLoggingParams;
} }
#if HIP_ROCPROFILER_REGISTER > 0 #if HIP_ROCPROFILER_REGISTER > 0
@@ -2114,15 +2120,19 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipOccupancyAvailableDynamicSMemPerBlock_fn, 5
HIP_ENFORCE_ABI(HipDispatchTable, hipGetProcAddress_spt_fn, 506); HIP_ENFORCE_ABI(HipDispatchTable, hipGetProcAddress_spt_fn, 506);
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 20 // HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
HIP_ENFORCE_ABI(HipDispatchTable, hipKernelGetParamInfo_fn, 507); HIP_ENFORCE_ABI(HipDispatchTable, hipKernelGetParamInfo_fn, 507);
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
HIP_ENFORCE_ABI(HipDispatchTable, hipExtDisableLogging_fn, 508);
HIP_ENFORCE_ABI(HipDispatchTable, hipExtEnableLogging_fn, 509);
HIP_ENFORCE_ABI(HipDispatchTable, hipExtSetLoggingParams_fn, 510);
// if HIP_ENFORCE_ABI entries are added for each new function pointer in the table, the number below // if HIP_ENFORCE_ABI entries are added for each new function pointer in the table, the number below
// will be +1 of the number in the last HIP_ENFORCE_ABI line. E.g.: // will be +1 of the number in the last HIP_ENFORCE_ABI line. E.g.:
// //
// HIP_ENFORCE_ABI(<table>, <functor>, 8) // HIP_ENFORCE_ABI(<table>, <functor>, 8)
// //
// HIP_ENFORCE_ABI_VERSIONING(<table>, 9) <- 8 + 1 = 9 // HIP_ENFORCE_ABI_VERSIONING(<table>, 9) <- 8 + 1 = 9
HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 508) HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 511)
static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 20, static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 21,
"If you get this error, add new HIP_ENFORCE_ABI(...) code for the new function " "If you get this error, add new HIP_ENFORCE_ABI(...) code for the new function "
"pointers and then update this check so it is true"); "pointers and then update this check so it is true");
#endif #endif
+3
Visa fil
@@ -645,6 +645,9 @@ global:
hipOccupancyAvailableDynamicSMemPerBlock; hipOccupancyAvailableDynamicSMemPerBlock;
hipGetProcAddress_spt; hipGetProcAddress_spt;
hipKernelGetParamInfo; hipKernelGetParamInfo;
hipExtDisableLogging;
hipExtEnableLogging;
hipExtSetLoggingParams;
local: local:
*; *;
} hip_7.1; } hip_7.1;
+31
Visa fil
@@ -0,0 +1,31 @@
#include <hip/hip_runtime.h>
#include "hip_internal.hpp"
#include "hip_platform.hpp"
namespace hip {
// Applies the logging parameters stored in PlatformState: copies log_level_ into
// AMD_LOG_LEVEL and log_mask_ into AMD_LOG_MASK while holding the logging lock.
// The values applied are whatever hipExtSetLoggingParams() last stored (or the
// values PlatformState was constructed with if it was never called).
// Always returns hipSuccess.
hipError_t hipExtEnableLogging() {
HIP_INIT_API(hipExtEnableLogging);
amd::ScopedLock lock(PlatformState::instance().getLogLock());
AMD_LOG_LEVEL = PlatformState::instance().log_level_;
AMD_LOG_MASK = PlatformState::instance().log_mask_;
HIP_RETURN(hipSuccess);
}
// Sets AMD_LOG_LEVEL to 0 while holding the logging lock. AMD_LOG_MASK and the
// parameters stored in PlatformState are left untouched, so a later
// hipExtEnableLogging() re-applies the stored level and mask.
// Always returns hipSuccess.
hipError_t hipExtDisableLogging() {
HIP_INIT_API(hipExtDisableLogging);
amd::ScopedLock lock(PlatformState::instance().getLogLock());
AMD_LOG_LEVEL = 0;
HIP_RETURN(hipSuccess);
}
// Records log_level, log_size and log_mask in PlatformState under the logging lock.
// This does not modify AMD_LOG_LEVEL or AMD_LOG_MASK; the stored level and mask
// take effect on the next hipExtEnableLogging() call.
// NOTE(review): log_size_ is stored but not read by any function in this file —
// confirm where it is consumed.
// Always returns hipSuccess; the arguments are not validated here.
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask) {
HIP_INIT_API(hipExtSetLoggingParams, log_level, log_size, log_mask);
amd::ScopedLock lock(PlatformState::instance().getLogLock());
// Store logging parameters for later activation
PlatformState::instance().log_level_ = log_level;
PlatformState::instance().log_size_ = log_size;
PlatformState::instance().log_mask_ = log_mask;
HIP_RETURN(hipSuccess);
}
} // namespace hip
+17 -1
Visa fil
@@ -50,9 +50,12 @@ class PlatformState {
// Unique FD Store Lock // Unique FD Store Lock
amd::Monitor ufd_lock_{true}; amd::Monitor ufd_lock_{true};
// Lock for logging operations
amd::Monitor lg_lock_{true};
// Singleton object // Singleton object
static PlatformState* platform_; static PlatformState* platform_;
PlatformState() {} PlatformState() : log_level_(0), log_size_(0), log_mask_(0) {}
~PlatformState() {} ~PlatformState() {}
public: public:
@@ -113,6 +116,14 @@ class PlatformState {
size_t UfdMapSize() const { return ufd_map_.size(); } size_t UfdMapSize() const { return ufd_map_.size(); }
// Logging lock accessor
amd::Monitor& getLogLock() { return lg_lock_; }
// Friend functions for logging access
friend hipError_t hipExtEnableLogging();
friend hipError_t hipExtDisableLogging();
friend hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask);
inline bool RegisterLibraryFunction(const hipKernel_t f, const hipLibrary_t l) { inline bool RegisterLibraryFunction(const hipKernel_t f, const hipLibrary_t l) {
amd::ScopedLock lock(lock_); amd::ScopedLock lock(lock_);
if (library_functions_.find(f) == library_functions_.end()) { if (library_functions_.find(f) == library_functions_.end()) {
@@ -150,5 +161,10 @@ class PlatformState {
void* dynamicLibraryHandle_{nullptr}; void* dynamicLibraryHandle_{nullptr};
std::unordered_map<hipKernel_t, hipLibrary_t> library_functions_; std::unordered_map<hipKernel_t, hipLibrary_t> library_functions_;
// Logging state (moved from LoggingInfo singleton)
size_t log_level_;
size_t log_size_;
size_t log_mask_;
}; };
} // namespace hip } // namespace hip
@@ -2067,4 +2067,13 @@ hipError_t hipKernelGetParamInfo(hipKernel_t kernel, size_t paramIndex, size_t*
size_t* paramSize) { size_t* paramSize) {
return hip::GetHipDispatchTable()->hipKernelGetParamInfo_fn(kernel, paramIndex, paramOffset, return hip::GetHipDispatchTable()->hipKernelGetParamInfo_fn(kernel, paramIndex, paramOffset,
paramSize); paramSize);
}
// Public entry point: forwards the call to the hipExtEnableLogging_fn slot of
// the HIP dispatch table and returns its result unchanged.
hipError_t hipExtEnableLogging() {
  auto dispatch = hip::GetHipDispatchTable();
  return dispatch->hipExtEnableLogging_fn();
}
// Public entry point: forwards the call to the hipExtDisableLogging_fn slot of
// the HIP dispatch table and returns its result unchanged.
hipError_t hipExtDisableLogging() {
  auto dispatch = hip::GetHipDispatchTable();
  return dispatch->hipExtDisableLogging_fn();
}
// Public entry point: forwards log_level, log_size and log_mask, in that order,
// to the hipExtSetLoggingParams_fn slot of the HIP dispatch table and returns
// its result unchanged.
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask) {
return hip::GetHipDispatchTable()->hipExtSetLoggingParams_fn(log_level, log_size, log_mask);
} }
@@ -138,14 +138,6 @@
"=== TODO ===", "=== TODO ===",
"Unit_Device_tgammaf_Accuracy_Limited_Positive", "Unit_Device_tgammaf_Accuracy_Limited_Positive",
"=== TODO === fail on 100% test data", "=== TODO === fail on 100% test data",
"Unit_Device_hexp10_Accuracy_Positive",
"Unit_Device_h2exp10_Accuracy_Positive",
"Unit_Device_hexp2_Accuracy_Positive",
"Unit_Device_h2exp2_Accuracy_Positive",
"Unit_Device_hlog_Accuracy_Positive",
"Unit_Device_h2log_Accuracy_Positive",
"Unit_Device_hlog10_Accuracy_Positive",
"Unit_Device_h2log10_Accuracy_Positive",
"Unit_Device___hfma2_Accuracy_Positive", "Unit_Device___hfma2_Accuracy_Positive",
#endif #endif
#if defined gfx90a || defined gfx942 || defined gfx950 #if defined gfx90a || defined gfx942 || defined gfx950
@@ -752,6 +752,8 @@
#endif #endif
"=== Following tests disabled as it should be a local perf test", "=== Following tests disabled as it should be a local perf test",
"Performance_hipExtLaunchKernelGGL_QueryGPUFrequency", "Performance_hipExtLaunchKernelGGL_QueryGPUFrequency",
"Unit_hipDynamicLogging_Positive_Basic",
"Unit_hipDynamicLogging_Positive_MultipleEnableDisable",
"End of json" "End of json"
] ]
} }
@@ -20,6 +20,7 @@ set(TEST_SRC
binary_partition.cc binary_partition.cc
cg_ballot.cc cg_ballot.cc
cg_any_all.cc cg_any_all.cc
split_barrier.cc
) )
if(HIP_PLATFORM STREQUAL "nvidia") if(HIP_PLATFORM STREQUAL "nvidia")
@@ -0,0 +1,123 @@
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip/hip_cooperative_groups.h>
#include <hip_test_common.hh>
// Kernel exercising thread_block::barrier_arrive() / barrier_wait().
//
// Expects a single block of 32 threads (the shared staging buffer has 32
// entries). Each thread produces out[i] = 2 * in[i] + in[i], where the second
// term is read from shared memory that thread 0 filled.
//
// out: device buffer of at least blockDim.x floats, written by this kernel.
// in:  device buffer of at least 32 floats, read by this kernel.
static __global__ void wg_split_barrier(float *out, float *in) {
  namespace cg = cooperative_groups;
  __shared__ float mid[32];
  size_t i = threadIdx.x;
  auto tb = cg::this_thread_block();

  // Thread 0 stages the input in shared memory *before* arriving. A split
  // barrier only orders work issued before barrier_arrive() against work
  // issued after barrier_wait(); writes made between the two calls would race
  // with the reads of mid[] below.
  if (i == 0) {
    for (size_t j = 0; j < 32; j++) {
      mid[j] = in[j];
    }
  }
  auto tok = tb.barrier_arrive();

  // Independent work overlapped with the barrier: it only touches this
  // thread's own output slot, so it needs no synchronization.
  out[i] = in[i] * 2.0f;

  tb.barrier_wait(std::move(tok));
  // All threads have arrived, so thread 0's writes to mid[] are visible here.
  out[i] += mid[i];
}
// Launches wg_split_barrier on one block of 32 threads with in[i] = i + 1 and
// checks that every output element equals 3 * in[i].
TEST_CASE("Unit_coop_thread_block_split_barrier") {
  constexpr size_t size = 32;
  float *d_out, *d_in;
  HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size));
  HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));

  std::vector<float> in(size, 0.0f), out = in;
  for (size_t i = 0; i < size; i++) {
    in[i] = i + 1;
  }

  HIP_CHECK(hipMemset(d_out, 0, sizeof(float) * size));
  HIP_CHECK(
      hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice));

  wg_split_barrier<<<1, size>>>(d_out, d_in);
  // Surface launch failures directly instead of as a value mismatch below.
  HIP_CHECK(hipGetLastError());

  HIP_CHECK(hipMemcpy(out.data(), d_out, sizeof(float) * size,
                      hipMemcpyDeviceToHost));
  HIP_CHECK(hipFree(d_out));
  HIP_CHECK(hipFree(d_in));

  for (size_t i = 0; i < size; i++) {
    INFO("Index: " << i << " in: " << in[i] << " out: " << out[i]);
    REQUIRE((in[i] * 3.0f) == Catch::Approx(out[i]));
  }
}
// Kernel exercising grid_group::barrier_arrive() / barrier_wait().
//
// Every thread with a global index below N stores (index + 1) into data[];
// after the grid-wide barrier, the thread with grid rank 0 sums data[0..N) into
// *result. Must be launched cooperatively so that cg::this_grid() is usable.
//
// data:   device buffer of at least N ints, written then read.
// result: device pointer receiving the sum.
// N:      number of elements to fill and sum.
static __global__ void grid_split_barrier(int *data, int *result, int N) {
  namespace cg = cooperative_groups;
  cg::grid_group grid = cg::this_grid();
  int gid = blockIdx.x * blockDim.x + threadIdx.x;

  // Publish this thread's element *before* arriving: the barrier only
  // guarantees visibility of writes issued ahead of barrier_arrive(). Writing
  // after the arrive would let rank 0 read data[] before it is complete.
  if (gid < N) {
    data[gid] = gid + 1;
  }
  auto tok = grid.barrier_arrive();
  grid.barrier_wait(std::move(tok));

  if (grid.thread_rank() == 0) {
    int sum = 0;
    for (int i = 0; i < N; i++) {
      sum += data[i];
    }
    *result = sum;
  }
}
// Cooperatively launches grid_split_barrier over N = 1024 elements and checks
// that the reported sum equals N * (N + 1) / 2. Reported as skipped when
// device 0 does not support cooperative launches.
TEST_CASE("Unit_coop_grids_split_barrier") {
  hipDeviceProp_t prop;
  HIP_CHECK(hipGetDeviceProperties(&prop, 0));
  if (prop.cooperativeLaunch == 0) {
    // Report the skip explicitly rather than passing without running anything.
    HipTest::HIP_SKIP_TEST(
        "Skipping grid split barrier test - cooperative launch not supported");
    return;
  }

  int N = 1024;
  const int threads = 128;
  const int blocks = (N + threads - 1) / threads;

  int *d_in, *d_out;
  HIP_CHECK(hipMalloc(&d_in, N * sizeof(int)));
  HIP_CHECK(hipMalloc(&d_out, sizeof(int)));

  void *args[] = {&d_in, &d_out, &N};
  dim3 grid(blocks);
  dim3 block(threads);
  HIP_CHECK(hipLaunchCooperativeKernel((void *)grid_split_barrier, grid,
                                       block, args, 0, 0));
  HIP_CHECK(hipDeviceSynchronize());

  int out = 0;
  HIP_CHECK(hipMemcpy(&out, d_out, sizeof(int), hipMemcpyDeviceToHost));
  HIP_CHECK(hipFree(d_in));
  HIP_CHECK(hipFree(d_out));

  REQUIRE(out == ((N * (N + 1)) / 2));
}
@@ -7,6 +7,7 @@ set(TEST_SRC
hipDrvGetErrorString.cc hipDrvGetErrorString.cc
hipGetLastError.cc hipGetLastError.cc
hipPeekAtLastError.cc hipPeekAtLastError.cc
hipDynamicLogging.cc
) )
if(UNIX) if(UNIX)
@@ -0,0 +1,134 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <iostream>
#include <sstream>
#include <streambuf>
#include <string>
#include <fstream>
#include <fcntl.h>
#ifdef _WIN32
#include <windows.h>
#include <io.h>
#include <stdio.h>
#include <stdlib.h>
#define dup _dup
#define dup2 _dup2
#define fd_close _close
#define unlink _unlink
#define STDERR_FD _fileno(stderr)
#define OPEN_FLAGS (_O_WRONLY | _O_CREAT | _O_TRUNC)
#define OPEN_MODE (_S_IREAD | _S_IWRITE)
#define open _open
#else
#include <unistd.h>
#define fd_close close
#define STDERR_FD STDERR_FILENO
#define OPEN_FLAGS (O_WRONLY | O_CREAT | O_TRUNC)
#define OPEN_MODE 0644
#endif
// Captures everything written to stderr while a capture is active.
//
// Two channels are redirected: the std::cerr stream buffer (into an in-memory
// stringstream) and the stderr file descriptor (into a temporary file, so that
// fprintf(stderr, ...) style output is captured as well). Usage:
//
//   OutCapture capture;
//   capture.startCapture();
//   ... code that logs ...
//   std::string text = capture.stopCapture();
//
// startCapture()/stopCapture() may be called repeatedly on the same object.
class OutCapture {
 private:
  std::stringstream captured_stream_;  // receives std::cerr output
  std::streambuf* cerr_backup_;        // original std::cerr buffer
  int stderr_backup_;                  // duplicate of the original stderr fd, -1 if dup failed
  std::string temp_file_;              // file receiving the redirected stderr fd

  // Builds the capture file path. The process id is part of the name so that
  // test processes running concurrently do not truncate or unlink each
  // other's capture file.
  static std::string getTempFilePath() {
#ifdef _WIN32
    const std::string file_name =
        "hip_stderr_capture_" + std::to_string(GetCurrentProcessId()) + ".txt";
    char temp_path[MAX_PATH];
    if (GetTempPathA(MAX_PATH, temp_path)) {
      return std::string(temp_path) + file_name;
    }
    // Fallback to current directory
    return file_name;
#else
    return "/tmp/hip_stderr_capture_" + std::to_string(getpid()) + ".txt";
#endif
  }

  // Points the stderr descriptor back at its original target. Does nothing
  // when the backup could not be created in the constructor.
  void restoreStderrFd() {
    if (stderr_backup_ != -1) {
      dup2(stderr_backup_, STDERR_FD);
    }
  }

 public:
  OutCapture() : temp_file_(getTempFilePath()) {
    // Backup original cerr stream buffer (HIP logging uses stderr)
    cerr_backup_ = std::cerr.rdbuf();
    // Backup original stderr file descriptor
    stderr_backup_ = dup(STDERR_FD);
  }

  // Starts a new capture, discarding anything captured previously.
  void startCapture() {
    // Clear any previous content
    captured_stream_.str("");
    captured_stream_.clear();

    // Redirect std::cerr to our stringstream
    std::cerr.rdbuf(captured_stream_.rdbuf());

    // Redirect stderr file descriptor to temp file (for fprintf to stderr).
    // If the file cannot be opened, only std::cerr output is captured.
    int temp_fd = open(temp_file_.c_str(), OPEN_FLAGS, OPEN_MODE);
    if (temp_fd != -1) {
      dup2(temp_fd, STDERR_FD);
      fd_close(temp_fd);
    }
  }

  // Ends the capture, restores stderr and returns the captured text: the
  // std::cerr output first, followed by the file-descriptor output (each line
  // terminated by '\n').
  std::string stopCapture() {
    // Restore original cerr stream
    std::cerr.rdbuf(cerr_backup_);

    // Restore original stderr file descriptor
    restoreStderrFd();

    // Read from temp file (captures fprintf(stderr) output from HIP logging)
    std::ifstream temp_file(temp_file_);
    std::string file_content;
    if (temp_file.is_open()) {
      std::string line;
      while (std::getline(temp_file, line)) {
        file_content += line + "\n";
      }
      temp_file.close();
    }

    // Combine both captures: C++ streams and file descriptor output
    std::string stream_content = captured_stream_.str();
    std::string total_output = stream_content + file_content;

    // Clean up temp file
    unlink(temp_file_.c_str());

    return total_output;
  }

  ~OutCapture() {
    // Ensure everything is restored even if stopCapture() was never reached
    std::cerr.rdbuf(cerr_backup_);
    restoreStderrFd();
    if (stderr_backup_ != -1) {
      fd_close(stderr_backup_);
    }
    unlink(temp_file_.c_str());
  }
};
@@ -0,0 +1,156 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include "OutCapture.hh"
/**
* @addtogroup hipDynamicLogging hipDynamicLogging
* @{
* @ingroup ErrorTest
* `hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask)` -
* Sets logging parameters for HIP runtime.
* `hipExtEnableLogging()` -
* Enables HIP runtime logging.
* `hipExtDisableLogging()` -
* Disables HIP runtime logging.
*/
static bool hipDynamicLoggingTest() {
// Create output capture instance
OutCapture capture;
capture.startCapture();
// Set Logging params
HIP_CHECK(hipExtSetLoggingParams(4, 0, -1));
// Logging is disabled here - allocate memory
int* dptr = nullptr;
HIP_CHECK(hipMalloc(&dptr, sizeof(int)));
// Stop capture after hipMalloc and check no output (logging disabled)
std::string malloc_output = capture.stopCapture();
if (malloc_output.size() != 0) {
INFO("Unexpected logging output during hipMalloc (logging should be disabled): " << malloc_output);
return false;
}
// Start capture before enabling logging
capture.startCapture();
// Enable logging and do memset
HIP_CHECK(hipExtEnableLogging());
HIP_CHECK(hipMemset(dptr, 0x00, sizeof(int)));
// Disable logging
HIP_CHECK(hipExtDisableLogging());
// Stop capture after disabling logging and check for output
std::string logging_output = capture.stopCapture();
if (logging_output.size() == 0) {
INFO("Expected logging output during enabled logging period, but got none");
return false;
}
// Clean up
HIP_CHECK(hipFree(dptr));
INFO("Successfully captured HIP logging output (" << logging_output.size() << " bytes)");
INFO("Logging output: " << logging_output);
return true;
}
/**
* Test Description
* ------------------------
* - Validates that HIP dynamic logging works correctly:
* 1. No output when logging is disabled
* 2. Logging output is captured when logging is enabled
* 3. hipMemset operation produces logging output during enabled period
* Test source
* ------------------------
* - unit/errorHandling/hipDynamicLogging.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
// Runs the basic dynamic-logging scenario when at least one device is present;
// otherwise the test is reported as skipped.
TEST_CASE("Unit_hipDynamicLogging_Positive_Basic") {
  int deviceCount = 0;
  HIP_CHECK(hipGetDeviceCount(&deviceCount));

  if (deviceCount > 0) {
    REQUIRE(hipDynamicLoggingTest() == true);
  } else {
    HipTest::HIP_SKIP_TEST("Skipping hipDynamicLogging test - no devices available");
  }
}
/**
* Test Description
* ------------------------
* - Validates that hipExtSetLoggingParams sets logging parameters correctly
* and that logging can be enabled/disabled multiple times
* Test source
* ------------------------
* - unit/errorHandling/hipDynamicLogging.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
// Repeats the enable -> hipMemset -> disable sequence three times and requires
// that every cycle produces some captured stderr output.
TEST_CASE("Unit_hipDynamicLogging_Positive_MultipleEnableDisable") {
  int deviceCount = 0;
  HIP_CHECK(hipGetDeviceCount(&deviceCount));
  if (deviceCount < 1) {
    HipTest::HIP_SKIP_TEST("Skipping hipDynamicLogging test - no devices available");
    return;
  }

  OutCapture capture;
  int* dptr = nullptr;
  HIP_CHECK(hipMalloc(&dptr, sizeof(int)));

  // Parameters are set once, up front, and reused by every cycle below
  HIP_CHECK(hipExtSetLoggingParams(3, 0, -1));

  int cycle = 0;
  while (cycle < 3) {
    // Everything between startCapture() and stopCapture() is recorded
    capture.startCapture();
    HIP_CHECK(hipExtEnableLogging());
    HIP_CHECK(hipMemset(dptr, 0x42, sizeof(int)));
    HIP_CHECK(hipExtDisableLogging());

    // The enabled window must have produced some output
    std::string output = capture.stopCapture();
    REQUIRE(output.size() > 0);
    ++cycle;
  }

  HIP_CHECK(hipFree(dptr));
}
/**
* End doxygen group ErrorTest.
* @}
*/
@@ -45,7 +45,7 @@ MATH_UNARY_HP_KERNEL_DEF(hcos);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(hcos, static_cast<float (*)(float)>(std::cos), MATH_UNARY_HP_TEST_DEF_IMPL(hcos, static_cast<float (*)(float)>(std::cos),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2cos); MATH_UNARY_HP_KERNEL_DEF(h2cos);
@@ -63,7 +63,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2cos);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(h2cos, static_cast<float (*)(float)>(std::cos), MATH_UNARY_HP_TEST_DEF_IMPL(h2cos, static_cast<float (*)(float)>(std::cos),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hsin); MATH_UNARY_HP_KERNEL_DEF(hsin);
@@ -82,7 +82,7 @@ MATH_UNARY_HP_KERNEL_DEF(hsin);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(hsin, static_cast<float (*)(float)>(std::sin), MATH_UNARY_HP_TEST_DEF_IMPL(hsin, static_cast<float (*)(float)>(std::sin),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2sin); MATH_UNARY_HP_KERNEL_DEF(h2sin);
@@ -100,7 +100,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2sin);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(h2sin, static_cast<float (*)(float)>(std::sin), MATH_UNARY_HP_TEST_DEF_IMPL(h2sin, static_cast<float (*)(float)>(std::sin),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hexp); MATH_UNARY_HP_KERNEL_DEF(hexp);
@@ -119,7 +119,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(hexp, static_cast<float (*)(float)>(std::exp), MATH_UNARY_HP_TEST_DEF_IMPL(hexp, static_cast<float (*)(float)>(std::exp),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2exp); MATH_UNARY_HP_KERNEL_DEF(h2exp);
@@ -137,7 +137,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(h2exp, static_cast<float (*)(float)>(std::exp), MATH_UNARY_HP_TEST_DEF_IMPL(h2exp, static_cast<float (*)(float)>(std::exp),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hexp10); MATH_UNARY_HP_KERNEL_DEF(hexp10);
@@ -156,7 +156,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp10);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(hexp10, static_cast<float (*)(float)>(exp10f), MATH_UNARY_HP_TEST_DEF_IMPL(hexp10, static_cast<float (*)(float)>(exp10f),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2exp10); MATH_UNARY_HP_KERNEL_DEF(h2exp10);
@@ -174,7 +174,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp10);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(h2exp10, static_cast<float (*)(float)>(exp10f), MATH_UNARY_HP_TEST_DEF_IMPL(h2exp10, static_cast<float (*)(float)>(exp10f),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hexp2); MATH_UNARY_HP_KERNEL_DEF(hexp2);
@@ -193,7 +193,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp2);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(hexp2, static_cast<float (*)(float)>(std::exp2), MATH_UNARY_HP_TEST_DEF_IMPL(hexp2, static_cast<float (*)(float)>(std::exp2),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2exp2); MATH_UNARY_HP_KERNEL_DEF(h2exp2);
@@ -211,7 +211,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp2);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(h2exp2, static_cast<float (*)(float)>(std::exp2), MATH_UNARY_HP_TEST_DEF_IMPL(h2exp2, static_cast<float (*)(float)>(std::exp2),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hlog); MATH_UNARY_HP_KERNEL_DEF(hlog);
@@ -230,7 +230,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(hlog, static_cast<float (*)(float)>(std::log), MATH_UNARY_HP_TEST_DEF_IMPL(hlog, static_cast<float (*)(float)>(std::log),
ULPValidatorBuilderFactory<float>(1)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2log); MATH_UNARY_HP_KERNEL_DEF(h2log);
@@ -248,7 +248,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(h2log, static_cast<float (*)(float)>(std::log), MATH_UNARY_HP_TEST_DEF_IMPL(h2log, static_cast<float (*)(float)>(std::log),
ULPValidatorBuilderFactory<float>(1)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hlog10); MATH_UNARY_HP_KERNEL_DEF(hlog10);
@@ -267,7 +267,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog10);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(hlog10, static_cast<float (*)(float)>(std::log10), MATH_UNARY_HP_TEST_DEF_IMPL(hlog10, static_cast<float (*)(float)>(std::log10),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2log10); MATH_UNARY_HP_KERNEL_DEF(h2log10);
@@ -285,7 +285,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log10);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(h2log10, static_cast<float (*)(float)>(std::log10), MATH_UNARY_HP_TEST_DEF_IMPL(h2log10, static_cast<float (*)(float)>(std::log10),
ULPValidatorBuilderFactory<float>(2)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hlog2); MATH_UNARY_HP_KERNEL_DEF(hlog2);
@@ -304,7 +304,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog2);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(hlog2, static_cast<float (*)(float)>(std::log2), MATH_UNARY_HP_TEST_DEF_IMPL(hlog2, static_cast<float (*)(float)>(std::log2),
ULPValidatorBuilderFactory<float>(1)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2log2); MATH_UNARY_HP_KERNEL_DEF(h2log2);
@@ -322,7 +322,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log2);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(h2log2, static_cast<float (*)(float)>(std::log2), MATH_UNARY_HP_TEST_DEF_IMPL(h2log2, static_cast<float (*)(float)>(std::log2),
ULPValidatorBuilderFactory<float>(1)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hsqrt); MATH_UNARY_HP_KERNEL_DEF(hsqrt);
@@ -341,7 +341,7 @@ MATH_UNARY_HP_KERNEL_DEF(hsqrt);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(hsqrt, static_cast<float (*)(float)>(std::sqrt), MATH_UNARY_HP_TEST_DEF_IMPL(hsqrt, static_cast<float (*)(float)>(std::sqrt),
ULPValidatorBuilderFactory<float>(1)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2sqrt); MATH_UNARY_HP_KERNEL_DEF(h2sqrt);
@@ -359,7 +359,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2sqrt);
* - HIP_VERSION >= 5.2 * - HIP_VERSION >= 5.2
*/ */
MATH_UNARY_HP_TEST_DEF_IMPL(h2sqrt, static_cast<float (*)(float)>(std::sqrt), MATH_UNARY_HP_TEST_DEF_IMPL(h2sqrt, static_cast<float (*)(float)>(std::sqrt),
ULPValidatorBuilderFactory<float>(1)); ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hceil); MATH_UNARY_HP_KERNEL_DEF(hceil);
@@ -187,7 +187,9 @@ template <typename T, typename... Ts> class MathTest {
std::stringstream ss; std::stringstream ss;
ss << "Input value(s): " << std::scientific ss << "Input value(s): " << std::scientific
<< std::setprecision(std::numeric_limits<T>::max_digits10 - 1); << std::setprecision(std::numeric_limits<T>::max_digits10 - 1);
((ss << " " << args), ...) << "\n" << actual_val << " "; ((ss << " " << args), ...) << "\n"
<< "Output value: " << actual_val << "\n"
<< "Condition failed: ";
return ss.str(); return ss.str();
} }
+121 -1
Visa fil
@@ -25,6 +25,12 @@ THE SOFTWARE.
#include <catch2/catch_all.hpp> #include <catch2/catch_all.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp> #include <catch2/matchers/catch_matchers_floating_point.hpp>
#include <cstdint>
#include <cstring>
#include <iomanip>
#include "Float16.hh"
// Define a new MatcherBase class with a public 'describe' member function because // Define a new MatcherBase class with a public 'describe' member function because
// Catch::MatcherBase::describe is protected and thus can't be used via a pointer to // Catch::MatcherBase::describe is protected and thus can't be used via a pointer to
// Catch::MatcherBase. // Catch::MatcherBase.
@@ -61,6 +67,113 @@ template <typename T, typename Matcher> class ValidatorBase : public MatcherBase
bool nan = false; bool nan = false;
}; };
// Matcher that accepts a Float16 value lying within a given number of ULPs of
// a target value.
//
// The distance is computed on the raw 16-bit encodings: for two values of the
// same sign it is the absolute difference of their bit patterns. Values of
// different sign only match when they compare equal (i.e. +0 vs -0), and NaN
// never matches.
struct Float16WithinUlpsMatcher : MatcherBase<Float16> {
  // target: expected value; ulps: maximum accepted distance in ULPs.
  Float16WithinUlpsMatcher(Float16 target, uint64_t ulps) : m_target(target), m_ulps(ulps) {}

  // Returns true when matchee is within m_ulps ULPs of the target.
  bool match(Float16 const& matchee) const override {
    // Comparison with NaN should always be false.
    // This way we can rule it out before getting into the ugly details
    if (__hisnan(matchee) || __hisnan(m_target)) {
      return false;
    }

    auto value_bits = convertFloat16toInt16(matchee);
    auto target_bits = convertFloat16toInt16(m_target);

    // If signs differ, handle the special +0 vs -0 case explicitly.
    if ((value_bits < 0) != (target_bits < 0)) {
      return matchee == m_target;
    }

    // Operands are promoted to int, so the subtraction cannot overflow.
    auto ulp_diff = std::abs(value_bits - target_bits);
    return static_cast<uint64_t>(ulp_diff) <= m_ulps;
  }

  // Describes the accepted range, e.g. "is within N ULPs of T ([lo, hi])".
  std::string describe() const override {
    std::stringstream ret;

    ret << "is within " << m_ulps << " ULPs of ";

    write(ret, m_target);
    ret << 'f';

    ret << " ([";
    write(ret, step(m_target, -FLOAT16_MAX, m_ulps));
    ret << ", ";
    write(ret, step(m_target, FLOAT16_MAX, m_ulps));
    ret << "])";

    return ret.str();
  }

 private:
  // Returns the representable Float16 value adjacent to `from` in the
  // direction of `direction` (the Float16 analogue of nextafter).
  Float16 getNextAfter(Float16 from, Float16 direction) const {
    constexpr int16_t signbit_float16 = 0x8000;
    // Every bit except the sign bit; zero for both +0 and -0.
    constexpr int16_t magnitude_mask = 0x7FFF;

    // Encode inputs as 16-bit integers
    const int16_t from_bits = convertFloat16toInt16(from);
    const int16_t direction_bits = convertFloat16toInt16(direction);

    // Special cases.
    // Already there: return the value itself. (Returning the raw bits here
    // would convert the integer numerically instead of reinterpreting it.)
    if (from_bits == direction_bits) return direction;
    // Both operands are zeros of either sign: they are numerically equal, so
    // the result is `direction`. The sign bit is masked off because the
    // encoding of -0 is not the integer zero.
    if ((from_bits & magnitude_mask) == 0 && (direction_bits & magnitude_mask) == 0)
      return direction;

    // Makes integer comparisons reflect numeric ordering across sign.
    const int16_t from_ordered = (from_bits < 0) ? signbit_float16 - from_bits : from_bits;
    const int16_t direction_ordered =
        (direction_bits < 0) ? signbit_float16 - direction_bits : direction_bits;

    // Decide whether to move up or down by one ULP
    const int16_t step = (from_ordered < direction_ordered) ? 1 : -1;

    // Take one step
    const int16_t after_step_ordered = from_ordered + step;

    // Map back from ordered space to raw Float16 bits.
    int16_t next_bits =
        (after_step_ordered < 0) ? signbit_float16 - after_step_ordered : after_step_ordered;

    // Stepping up from the smallest-magnitude negative value lands on -0
    // rather than +0.
    if (from_ordered == -1 && (from_ordered < direction_ordered)) {
      next_bits = signbit_float16;
    }

    return convertInt16toFloat16(next_bits);
  }

  // Applies getNextAfter `steps` times starting from `start`.
  Float16 step(Float16 start, Float16 direction, uint64_t steps) const {
    Float16 result = start;
    for (uint64_t i = 0; i < steps; ++i) {
      result = getNextAfter(result, direction);
    }
    return result;
  }

  // Streams num in scientific notation with 5 digits after the decimal point.
  void write(std::ostream& out, Float16 num) const {
    const uint32_t float16_max_digits = 5;
    out << std::scientific << std::setprecision(float16_max_digits) << num;
  }

  // Reinterprets a 16-bit pattern as a Float16 (bit copy, no numeric conversion).
  static Float16 convertInt16toFloat16(int16_t d) {
    Float16 i;
    std::memcpy(&i, &d, sizeof(int16_t));
    return i;
  }

  // Reinterprets a Float16 as its 16-bit pattern (bit copy, no numeric conversion).
  static int16_t convertFloat16toInt16(Float16 d) {
    uint16_t i;
    std::memcpy(&i, &d, sizeof(Float16));
    return i;
  }

  Float16 m_target;  // expected value
  uint64_t m_ulps;   // maximum accepted distance in ULPs
};
template <typename T> auto ULPValidatorBuilderFactory(int64_t ulps) { template <typename T> auto ULPValidatorBuilderFactory(int64_t ulps) {
return [=](T target, auto&&...) { return [=](T target, auto&&...) {
return std::make_unique<ValidatorBase<T, Catch::Matchers::WithinUlpsMatcher>>( return std::make_unique<ValidatorBase<T, Catch::Matchers::WithinUlpsMatcher>>(
@@ -68,6 +181,13 @@ template <typename T> auto ULPValidatorBuilderFactory(int64_t ulps) {
}; };
}; };
// Float16 specialization: returns a builder that, given a target value, creates
// a validator backed by Float16WithinUlpsMatcher with the requested ULP
// tolerance. Any further arguments passed to the builder are ignored.
template <> inline auto ULPValidatorBuilderFactory<Float16>(int64_t ulps) {
  using Float16UlpValidator = ValidatorBase<Float16, Float16WithinUlpsMatcher>;
  return [=](Float16 target, auto&&...) {
    return std::make_unique<Float16UlpValidator>(target,
                                                 Float16WithinUlpsMatcher(target, ulps));
  };
};
template <typename T> auto AbsValidatorBuilderFactory(double margin) { template <typename T> auto AbsValidatorBuilderFactory(double margin) {
return [=](T target, auto&&...) { return [=](T target, auto&&...) {
return std::make_unique<ValidatorBase<T, Catch::Matchers::WithinAbsMatcher>>( return std::make_unique<ValidatorBase<T, Catch::Matchers::WithinAbsMatcher>>(
@@ -96,7 +216,7 @@ template <typename T> class EqValidator : public MatcherBase<T> {
std::string describe() const override { std::string describe() const override {
std::stringstream ss; std::stringstream ss;
ss << " is not equal to " << target_; ss << "is equal to " << target_;
return ss.str(); return ss.str();
} }
@@ -177,8 +177,8 @@ static bool checkhipMemPoolTrimTo(hipStream_t stream, int N, int dev = 0) {
testObj.transferFromMempool(stream); testObj.transferFromMempool(stream);
testObj.freeDevBuf(stream); testObj.freeDevBuf(stream);
// verify and validate // verify and validate
REQUIRE(true == testObj.validateResult());
HIP_CHECK(hipStreamSynchronize(stream)); HIP_CHECK(hipStreamSynchronize(stream));
REQUIRE(true == testObj.validateResult());
} }
HIP_CHECK(hipMemPoolDestroy(mem_pool)); HIP_CHECK(hipMemPoolDestroy(mem_pool));
return true; return true;
@@ -494,7 +494,6 @@ HIP doesn't support the following CUDA functions/operators in ``cooperative_grou
* ``synchronize`` * ``synchronize``
* ``memcpy_async`` * ``memcpy_async``
* ``wait`` and ``wait_prior`` * ``wait`` and ``wait_prior``
* ``barrier_arrive`` and ``barrier_wait``
* ``invoke_one`` and ``invoke_one_broadcast`` * ``invoke_one`` and ``invoke_one_broadcast``
* ``reduce`` * ``reduce``
* ``reduce_update_async`` and ``reduce_store_async`` * ``reduce_update_async`` and ``reduce_store_async``
+39
Visa fil
@@ -9615,6 +9615,45 @@ hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject);
/** /**
* @} * @}
*/ */
/**
* @brief Enable HIP runtime logging.
*
* This function enables the HIP runtime logging mechanism, allowing diagnostic
* and trace information to be captured during HIP API execution.
*
* @returns #hipSuccess
*
* @see hipExtDisableLogging, hipExtSetLoggingParams
*/
hipError_t hipExtEnableLogging();
/**
* @brief Disable HIP runtime logging.
*
* This function disables the HIP runtime logging mechanism, stopping the capture
* of diagnostic and trace information during HIP API execution.
*
* @returns #hipSuccess
*
* @see hipExtEnableLogging, hipExtSetLoggingParams
*/
hipError_t hipExtDisableLogging();
/**
* @brief Set HIP runtime logging parameters.
*
* This function stores the logging configuration of the HIP runtime: the
* verbosity level and the mask of components to log.
*
* @param [in] log_level The logging verbosity level. Higher values produce more detailed output.
* @param [in] log_size Reserved for future use. Currently not implemented.
* @param [in] log_mask A bitmask specifying which HIP runtime components to log.
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
* @see hipExtEnableLogging, hipExtDisableLogging
*/
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask);
#ifdef __cplusplus #ifdef __cplusplus
} /* extern "c" */ } /* extern "c" */
#endif #endif
@@ -2,7 +2,7 @@ default_stages: [pre-commit]
fail_fast: true fail_fast: true
repos: repos:
- repo: https://github.com/pre-commit/pre-commit-hooks - repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0 rev: v6.0.0
hooks: hooks:
- id: check-yaml - id: check-yaml
- id: end-of-file-fixer - id: end-of-file-fixer
@@ -12,7 +12,7 @@ repos:
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version. Check https://github.com/astral-sh/ruff-pre-commit#version-compatibility # Ruff version. Check https://github.com/astral-sh/ruff-pre-commit#version-compatibility
# for the latest ruff version supported by the hook. # for the latest ruff version supported by the hook.
rev: v0.12.12 rev: v0.14.11
hooks: hooks:
- id: ruff-check - id: ruff-check
args: [--fix] args: [--fix]
+4
Visa fil
@@ -49,6 +49,10 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
* Fix issue where counter collection data was empty when profiling workload which spawn multiple child processes * Fix issue where counter collection data was empty when profiling workload which spawn multiple child processes
* Fix issue where dispatch filtering in a range (e.g. >2) was not working
* Fix redundant warnings for compute/memory partition not found for < MI 300 series GPUs by skipping partition checks
### Removed ### Removed
* Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter. * Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter.
+7 -13
Visa fil
@@ -4,14 +4,13 @@
ROCm Compute Profiler is a system performance profiling tool for machine ROCm Compute Profiler is a system performance profiling tool for machine
learning/HPC workloads running on AMD MI GPUs. The tool presently learning/HPC workloads running on AMD MI GPUs. The tool presently
targets usage on MI100, MI200, and MI300 accelerators. targets usage on MI100, MI200, MI300, and MI350 series accelerators.
* For more information on available features, installation steps, and * For more information on available features, installation steps, and
workload profiling and analysis, please refer to the online workload profiling and analysis, please refer to the online
[documentation](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/). [documentation](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/).
* ROCm Compute Profiler is an AMD open source research project and is not supported * ROCm Compute Profiler is an AMD open source tool that is part of the ROCm software stack. We welcome contributions and
as part of the ROCm software stack. We welcome contributions and
feedback from the community. Please see the feedback from the community. Please see the
[CONTRIBUTING.md](CONTRIBUTING.md) file for additional details on our [CONTRIBUTING.md](CONTRIBUTING.md) file for additional details on our
contribution process. contribution process.
@@ -39,8 +38,8 @@ python3 -m pip install -r requirements.txt
## Testing ## Testing
Populate the <usename> variable in `docker/docker-compose.customrocmtest.yml`. Populate the <username> variable in `docker/docker-compose.customrocmtest.yml`.
Populate the <rocm_build_image> variable in `docker/Dockerfile.customrocmtest` based on latest ROCm CI build information. Populate the <tarball_name> variable in `docker/Dockerfile.customrocmtest` based on latest TheRock nightly build information.
To quickly get the environment (bash shell) for building and testing, run the following commands: To quickly get the environment (bash shell) for building and testing, run the following commands:
* `cd docker` * `cd docker`
@@ -115,7 +114,7 @@ This software can be cited using a Zenodo
style reference is provided below for convenience: style reference is provided below for convenience:
``` ```
@software{xiaomin_lu_2022_7314631 @misc{xiaomin_lu_2022_7314631
author = {Xiaomin Lu and author = {Xiaomin Lu and
Cole Ramos and Cole Ramos and
Fei Zheng and Fei Zheng and
@@ -124,12 +123,7 @@ style reference is provided below for convenience:
Keith Lowery and Keith Lowery and
Nicholas Curtis and Nicholas Curtis and
Cristian Di Pietrantonio}, Cristian Di Pietrantonio},
title = {ROCm/rocprofiler-compute: v3.1.0 (12 February 2025)}, title = {rocprofiler-compute},
month = February, url = {https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-compute}
year = 2025,
publisher = {Zenodo},
version = {v3.1.0},
doi = {10.5281/zenodo.7314631},
url = {https://doi.org/10.5281/zenodo.7314631}
} }
``` ```
@@ -1,32 +1,52 @@
# Use a base image # Use a base image
FROM <rocm_build_image> FROM ubuntu:22.04
# Set the working directory # Install curl first (needed for ROCm download)
WORKDIR /app RUN apt-get update && apt-get install -y curl
# Define the tarball name as a variable
# Check https://therock-nightly-tarball.s3.amazonaws.com/index.html for latest builds
# Use therock-dist-linux-gfx<arch>-dcgpu-<rocm-version>.tar.gz naming convention
ARG TARBALL_NAME=<tarball_name>
# Install ROCm from TheRock Nightly build
RUN mkdir -p /rocm && \
curl -fLO https://therock-nightly-tarball.s3.amazonaws.com/${TARBALL_NAME} && \
tar -xf ${TARBALL_NAME} -C /rocm && \
rm ${TARBALL_NAME}
# Set environment variables for ROCm
ENV PATH="/rocm/bin:${PATH}" \
ROCM_PATH="/rocm" \
LD_LIBRARY_PATH="/rocm/lib:/rocm/lib/rocm_sysdeps/lib:${LD_LIBRARY_PATH}" \
HIP_DEVICE_LIB_PATH="/rocm/llvm/amdgcn/bitcode" \
HIP_PLATFORM=amd
# Update package list and install prerequisites # Update package list and install prerequisites
RUN apt-get update && apt-get install -y \ RUN apt-get update && apt-get install -y \
software-properties-common cmake locales git curl \ software-properties-common cmake locales git \
&& add-apt-repository ppa:deadsnakes/ppa \ && add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update && apt-get update
# Allows running git commands in /app
RUN git config --global --add safe.directory /app
# Generate the desired locale # Generate the desired locale
RUN locale-gen en_US.UTF-8 RUN locale-gen en_US.UTF-8
# Install Python 3.10 and pip # Install Python 3.10 and pip
RUN apt-get install -y python3.10 python3.10-venv python3.10-dev python3-pip libsqlite3-dev RUN apt-get install -y python3.10 python3.10-venv python3.10-dev python3-pip libsqlite3-dev
RUN python3.10 -m venv venv RUN python3.10 -m venv /venv
ENV PATH="venv/bin:$PATH" ENV PATH="/venv/bin:$PATH"
RUN python -m pip install --upgrade pip RUN python -m pip install --upgrade pip
# Install any dependencies specified in requirements.txt # Install any rocprofiler-compute dependencies specified in requirements.txt
WORKDIR /app/projects/rocprofiler-compute
COPY projects/rocprofiler-compute/requirements.txt /app/projects/rocprofiler-compute/requirements.txt COPY projects/rocprofiler-compute/requirements.txt /app/projects/rocprofiler-compute/requirements.txt
COPY projects/rocprofiler-compute/requirements-test.txt /app/projects/rocprofiler-compute/requirements-test.txt COPY projects/rocprofiler-compute/requirements-test.txt /app/projects/rocprofiler-compute/requirements-test.txt
RUN python -m pip install -r requirements.txt -r requirements-test.txt RUN python -m pip install -r /app/projects/rocprofiler-compute/requirements.txt -r /app/projects/rocprofiler-compute/requirements-test.txt
# Set the working directory
WORKDIR /app
# Allows running git commands in /app
RUN git config --global --add safe.directory /app
# Run interactive bash shell # Run interactive bash shell
CMD ["/bin/bash"] CMD ["/bin/bash"]
Binary file not shown.

Före

Bredd:  |  Höjd:  |  Storlek: 185 KiB

Efter

Bredd:  |  Höjd:  |  Storlek: 254 KiB

Binary file not shown.

Före

Bredd:  |  Höjd:  |  Storlek: 34 KiB

Efter

Bredd:  |  Höjd:  |  Storlek: 240 KiB

@@ -15,7 +15,7 @@ This section provides an overview of ROCm Compute Profiler's CLI analysis featur
* :ref:`Metric customization <cli-analysis-options>`: Isolate a subset of built-in metrics or build your own profiling configuration. * :ref:`Metric customization <cli-analysis-options>`: Isolate a subset of built-in metrics or build your own profiling configuration.
* :ref:`Filtering <cli-analysis-options>`: Hone in on a particular kernel, GPU ID, or dispatch ID via post-process filtering. * :ref:`Filtering <cli-analysis-options>`: Hone in on a particular kernel, GPU ID, or dispatch ID via post-process filtering.
* :ref:`Per-kernel roofline analysis <per-kernel-roofline>`: Detailed arithmetic intensity and performance analysis for individual kernels. * :ref:`Per-kernel roofline analysis <per-kernel-roofline>`: Detailed arithmetic intensity and performance analysis for individual kernels.
Run ``rocprof-compute analyze -h`` for more details. Run ``rocprof-compute analyze -h`` for more details.
@@ -346,6 +346,7 @@ Show System Speed-of-Light and CS_Busy blocks only
this case, ``1`` is the ID for System Speed-of-Light and ``5.1.0`` the ID for this case, ``1`` is the ID for System Speed-of-Light and ``5.1.0`` the ID for
GPU Busy Cycles metric. GPU Busy Cycles metric.
Filter kernels Filter kernels
First, list the top kernels in your application using ``--list-stats``. First, list the top kernels in your application using ``--list-stats``.
@@ -534,36 +535,40 @@ Analysis database example
.. code-block:: shell-session .. code-block:: shell-session
$ rocprof-compute analyze --verbose --db test -p workloads/vmem/MI300X_A1 -p workloads/vmem1/MI300X_A1 $ rocprof-compute analyze --verbose --output-name test --output-format db -p workloads/nbody/MI300X_A1 -p workloads/nbody1/MI300X_A1
DEBUG Execution mode = analyze DEBUG Execution mode = analyze
__ _ __ _
_ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___ _ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___
| '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \ | '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \
| | | (_) | (__| |_) | | | (_) | _|_____| (_| (_) | | | | | | |_) | |_| | || __/ | | | (_) | (__| |_) | | | (_) | _|_____| (_| (_) | | | | | | |_) | |_| | || __/
|_| \___/ \___| .__/|_| \___/|_| \___\___/|_| |_| |_| .__/ \__,_|\__\___| |_| \___/ \___| .__/|_| \___/|_| \___\___/|_| |_| |_| .__/ \__,_|\__\___|
|_| |_| |_| |_|
INFO Analysis mode = db INFO Analysis mode = db
DEBUG [omnisoc init] INFO ed45b0b189
DEBUG [omnisoc init] DEBUG [omnisoc init]
DEBUG [analysis] prepping to do some analysis INFO ed45b0b189
INFO [analysis] deriving rocprofiler-compute metrics... DEBUG [omnisoc init]
WARNING Roofline ceilings not found for /app/projects/rocprofiler-compute/workloads/vmem/MI300X_A1. DEBUG [analysis] prepping to do some analysis
WARNING Roofline ceilings not found for /app/projects/rocprofiler-compute/workloads/vmem1/MI300X_A1. INFO [analysis] deriving rocprofiler-compute metrics...
WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/vmem/MI300X_A1. DEBUG Collected roofline ceilings
WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/vmem1/MI300X_A1. WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/nbody/MI300X_A1.
DEBUG Collected dispatch data WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/nbody1/MI300X_A1.
DEBUG Applied analysis mode filters DEBUG Collected dispatch data
DEBUG Calculated dispatch data DEBUG Applied analysis mode filters
DEBUG Collected metrics data DEBUG Calculated dispatch data
WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg( DEBUG Collected metrics data
WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg(
(pmc_df.get("pmc_perf_ACCUM") / pmc_df.get("SQC_ICACHE_REQ")).where((pmc_df.get("SQC_ICACHE_REQ") != 0), None)) * 100), 0) - unsupported operand type(s) for /: 'NoneType' and 'float' (pmc_df.get("pmc_perf_ACCUM") / pmc_df.get("SQC_ICACHE_REQ")).where((pmc_df.get("SQC_ICACHE_REQ") != 0), None)) * 100), 0) - unsupported operand type(s) for /: 'NoneType' and 'float'
WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg( WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg(
(pmc_df.get("pmc_perf_ACCUM") / pmc_df.get("SQC_ICACHE_REQ")).where((pmc_df.get("SQC_ICACHE_REQ") != 0), None)) * 100), 0) - unsupported operand type(s) for /: 'NoneType' and 'float' (pmc_df.get("pmc_perf_ACCUM") / pmc_df.get("SQC_ICACHE_REQ")).where((pmc_df.get("SQC_ICACHE_REQ") != 0), None)) * 100), 0) - unsupported operand type(s) for /: 'NoneType' and 'float'
DEBUG Calculated metric values DEBUG Calculated metric values
DEBUG Calculated roofline data points DEBUG Calculated roofline data points
DEBUG [analysis] generating analysis DEBUG [analysis] generating analysis
DEBUG SQLite database initialized with name: test.db DEBUG SQLite database initialized with name: test.db
DEBUG Initialized database: test.db DEBUG Initialized database: test.db
DEBUG Completed writing database INFO ed45b0b189
INFO ed45b0b189
DEBUG Completed writing database
WARNING Created file: test.db
@@ -28,7 +28,9 @@ Launch the standalone GUI analyzer
---------------------------------- ----------------------------------
To launch the ROCm Compute Profiler GUI analyzer, include the ``--gui`` flag with your To launch the ROCm Compute Profiler GUI analyzer, include the ``--gui`` flag with your
desired analysis command. For example: desired analysis command.
For example:
.. code-block:: shell-session .. code-block:: shell-session
@@ -386,26 +386,24 @@ class OmniAnalyze_Base:
sys.exit(0) sys.exit(0)
# Ensure analysis output does not overwrite existing files # Ensure analysis output does not overwrite existing files
if not args.output_name: if args.output_name:
return if not re.match(r"^[A-Za-z0-9_-]+$", args.output_name):
console_error(
"analysis",
"Analysis output file/folder name must "
"contain only alphanumeric characters "
"or underscores (_), hyphens (-).",
)
if not re.match(r"^[A-Za-z0-9_-]+$", args.output_name): path_to_check = args.output_name
console_error( if args.output_format in ("txt", "db"):
"analysis", path_to_check += f".{args.output_format}"
"Analysis output file/folder name must "
"contain only alphanumeric characters "
"or underscores (_), hyphens (-).",
)
path_to_check = args.output_name if Path(path_to_check).exists():
if args.output_format in ("txt", "db"): console_error(
path_to_check += f".{args.output_format}" f"Analysis output file/folder {path_to_check} already exists. "
"Please choose a different name."
if Path(path_to_check).exists(): )
console_error(
f"Analysis output file/folder {path_to_check} already exists. "
"Please choose a different name."
)
# Check if any kernel's counters are missing due to iteration multiplexing # Check if any kernel's counters are missing due to iteration multiplexing
if ( if (
@@ -101,7 +101,9 @@ class db_analysis(OmniAnalyze_Base):
Database.init(db_name) Database.init(db_name)
console_debug(f"Initialized database: {db_name}") console_debug(f"Initialized database: {db_name}")
# Iterate over all workloads
for workload_path in self._runs.keys(): for workload_path in self._runs.keys():
# Add workload
workload_obj = orm.Workload( workload_obj = orm.Workload(
name=workload_path.split("/")[-2], name=workload_path.split("/")[-2],
sub_name=workload_path.split("/")[-1], sub_name=workload_path.split("/")[-1],
@@ -113,38 +115,9 @@ class db_analysis(OmniAnalyze_Base):
) )
Database.get_session().add(workload_obj) Database.get_session().add(workload_obj)
for pc_sample in self._pc_sampling_data_per_workload.get( # Add kernel
workload_path, pd.DataFrame()
).itertuples():
Database.get_session().add(
orm.PCsampling(
source=pc_sample.source_line,
instruction=pc_sample.instruction,
count=pc_sample.count,
kernel_name=pc_sample.kernel_name,
offset=pc_sample.offset,
count_issue=pc_sample.count_issued,
count_stall=pc_sample.count_stalled,
stall_reason=pc_sample.stall_reason,
workload=workload_obj,
)
)
for roofline_data in self._roofline_data_per_workload.get(
workload_path, pd.DataFrame()
).itertuples():
Database.get_session().add(
orm.RooflineData(
kernel_name=roofline_data.kernel_name,
total_flops=roofline_data.total_flops,
l1_cache_data=roofline_data.l1_cache_data,
l2_cache_data=roofline_data.l2_cache_data,
hbm_cache_data=roofline_data.hbm_cache_data,
workload=workload_obj,
)
)
kernel_objs: dict[str, orm.Kernel] = {} kernel_objs: dict[str, orm.Kernel] = {}
for dispatch in self._dispatch_data_per_workload.get( for dispatch in self._dispatch_data_per_workload.get(
workload_path, pd.DataFrame() workload_path, pd.DataFrame()
).itertuples(): ).itertuples():
@@ -167,44 +140,101 @@ class db_analysis(OmniAnalyze_Base):
) )
) )
# Optimize: Pre-group values by (metric_id, kernel_name) for O(1) lookups # Add roofline data points
values_df = self._values_data_per_workload.get( for roofline_data in self._roofline_data_per_workload.get(
workload_path, pd.DataFrame()
)
values_grouped = {}
if not values_df.empty:
for value in values_df.itertuples():
key = (value.metric_id, value.kernel_name)
if key not in values_grouped:
values_grouped[key] = []
values_grouped[key].append(value)
for metric in self._metrics_info_data_per_workload.get(
workload_path, pd.DataFrame() workload_path, pd.DataFrame()
).itertuples(): ).itertuples():
for kernel_name in kernel_objs.keys(): if roofline_data.kernel_name not in kernel_objs:
metric_obj = orm.Metric( console_warning(
name=metric.name, f"Kernel {roofline_data.kernel_name} from roofline data "
metric_id=metric.metric_id, "not found in dispatch data. Skipping roofline entry."
description=metric.description,
unit=metric.unit,
table_name=metric.table_name,
sub_table_name=metric.sub_table_name,
kernel=kernel_objs[kernel_name],
) )
Database.get_session().add(metric_obj) continue
Database.get_session().add(
orm.RooflineData(
total_flops=roofline_data.total_flops,
l1_cache_data=roofline_data.l1_cache_data,
l2_cache_data=roofline_data.l2_cache_data,
hbm_cache_data=roofline_data.hbm_cache_data,
kernel=kernel_objs[roofline_data.kernel_name],
)
)
# Direct lookup instead of iterating through all values # Add pc sampling data
key = (metric.metric_id, kernel_name) for pc_sample in self._pc_sampling_data_per_workload.get(
for value in values_grouped.get(key, []): workload_path, pd.DataFrame()
Database.get_session().add( ).itertuples():
orm.Value( if pc_sample.kernel_name not in kernel_objs:
metric=metric_obj, console_warning(
value_name=value.value_name, f"Kernel {pc_sample.kernel_name} from PC sampling data "
value=value.value, "not found in dispatch data. Skipping PC sampling entry."
) )
continue
Database.get_session().add(
orm.PCsampling(
source=pc_sample.source_line,
instruction=pc_sample.instruction,
count=pc_sample.count,
offset=pc_sample.offset,
count_issue=pc_sample.count_issued,
count_stall=pc_sample.count_stalled,
stall_reason=pc_sample.stall_reason,
kernel=kernel_objs[pc_sample.kernel_name],
)
)
# Add metrics and values - iterate on values, create metrics as needed
metrics_info_dict = {
row.metric_id: row
for row in self._metrics_info_data_per_workload.get(
workload_path, pd.DataFrame()
).itertuples()
}
metric_objs: dict[str, orm.MetricDefinition] = {}
for value in self._values_data_per_workload.get(
workload_path, pd.DataFrame()
).itertuples():
# Check if kernel exists
if value.kernel_name not in kernel_objs:
console_warning(
f"Kernel {value.kernel_name} from values data "
"not found in dispatch data. Skipping metric value."
)
continue
# Create or reuse metric object
if value.metric_id not in metric_objs:
# Fetch metric info
if value.metric_id not in metrics_info_dict:
console_warning(
f"Metric {value.metric_id} from values data "
"not found in metrics info. Skipping metric value."
) )
continue
metric_info = metrics_info_dict[value.metric_id]
metric_objs[value.metric_id] = orm.MetricDefinition(
name=metric_info.name,
metric_id=metric_info.metric_id,
description=metric_info.description,
unit=metric_info.unit,
table_name=metric_info.table_name,
sub_table_name=metric_info.sub_table_name,
workload=workload_obj,
)
Database.get_session().add(metric_objs[value.metric_id])
# Add value
Database.get_session().add(
orm.MetricValue(
metric=metric_objs[value.metric_id],
kernel=kernel_objs[value.kernel_name],
value_name=value.value_name,
value=value.value,
)
)
# Add metadata
version = get_version(rocprof_compute_home) version = get_version(rocprof_compute_home)
Database.get_session().add( Database.get_session().add(
orm.Metadata( orm.Metadata(
@@ -51,7 +51,6 @@ class webui_analysis(OmniAnalyze_Base):
self.app = dash.Dash( self.app = dash.Dash(
__name__, title=PROJECT_NAME, external_stylesheets=[dbc.themes.CYBORG] __name__, title=PROJECT_NAME, external_stylesheets=[dbc.themes.CYBORG]
) )
self.dest_dir = str(Path(args.path[0][0]).absolute().resolve())
self.arch: Optional[str] = None self.arch: Optional[str] = None
self.__hidden_sections = ["Memory Chart"] self.__hidden_sections = ["Memory Chart"]
@@ -90,6 +89,7 @@ class webui_analysis(OmniAnalyze_Base):
kernel_top_df = base_data.dfs[1] kernel_top_df = base_data.dfs[1]
for kernel_id in base_data.filter_kernel_ids: for kernel_id in base_data.filter_kernel_ids:
filt_kernel_names.append(str(kernel_top_df.loc[kernel_id, "Kernel_Name"])) filt_kernel_names.append(str(kernel_top_df.loc[kernel_id, "Kernel_Name"]))
input_filters["kernel"] = filt_kernel_names
# setup app layout # setup app layout
from utils.gui_components.header import get_header from utils.gui_components.header import get_header
@@ -338,6 +338,7 @@ class webui_analysis(OmniAnalyze_Base):
) )
args = self.get_args() args = self.get_args()
self.dest_dir = str(Path(args.path[0][0]).absolute().resolve())
# create 'mega dataframe' # create 'mega dataframe'
self._runs[self.dest_dir].raw_pmc = file_io.create_df_pmc( self._runs[self.dest_dir].raw_pmc = file_io.create_df_pmc(
@@ -531,12 +531,15 @@ class RocProfCompute_Base:
and not args.attach_pid and not args.attach_pid
): ):
# Use native counter collection tool # Use native counter collection tool
# Use lib* glob pattern to handle CMAKE_INSTALL_LIBDIR variations
# (lib, lib64, lib32, etc. depending on distribution)
native_tool_base_path = Path(sys.argv[0]).resolve().parents[2]
native_tool_glob_pattern = (
"lib*/rocprofiler-compute/librocprofiler-compute-tool.so"
)
try: try:
native_tool_path = str( native_tool_path = str(
Path(sys.argv[0]).resolve().parents[2] next(native_tool_base_path.glob(native_tool_glob_pattern))
/ "lib"
/ "rocprofiler-compute"
/ "librocprofiler-compute-tool.so"
) )
except Exception as e: except Exception as e:
console_debug( console_debug(
@@ -552,6 +555,7 @@ class RocProfCompute_Base:
) )
/ "librocprofiler-compute-tool.so" / "librocprofiler-compute-tool.so"
) )
native_tool_cpp_path = Path(__file__).resolve().parents[1] / "lib"
link_libraries = ("rocprofiler-sdk",) link_libraries = ("rocprofiler-sdk",)
build_command = ( build_command = (
# Create shared object # Create shared object
@@ -564,10 +568,10 @@ class RocProfCompute_Base:
# rocprofiler sdk library path # rocprofiler sdk library path
f"-L {str(Path(args.rocprofiler_sdk_tool_path).parent.parent)} " f"-L {str(Path(args.rocprofiler_sdk_tool_path).parent.parent)} "
# native tool source files (rocprofiler_compute_tool.cpp and helper.cpp) # native tool source files (rocprofiler_compute_tool.cpp and helper.cpp)
f"{str(Path(__file__).parent.parent)}/" f"{native_tool_cpp_path}/"
"lib/rocprofiler_compute_tool.cpp " "rocprofiler_compute_tool.cpp "
f"{str(Path(__file__).parent.parent)}/" f"{native_tool_cpp_path}/"
"lib/helper.cpp " "helper.cpp "
# temporary shared object for native tool # temporary shared object for native tool
f"-o {native_tool_path}" f"-o {native_tool_path}"
) )
@@ -575,7 +579,15 @@ class RocProfCompute_Base:
success, output = capture_subprocess_output(shlex.split(build_command)) success, output = capture_subprocess_output(shlex.split(build_command))
console_debug(f"Build output: {output}") console_debug(f"Build output: {output}")
if not success: if not success:
console_error("Failed to build native counter collection tool.") console_error(
"Failed to use native counter collection tool.\n"
"Could not find pre-built .so file at: "
f"{native_tool_base_path / native_tool_glob_pattern}\n"
"Could not find source .cpp files in folder: "
f"{native_tool_cpp_path}\n"
"Please ensure the native tool library is installed "
"or source files are present."
)
if self.__profiler == "rocprofiler-sdk": if self.__profiler == "rocprofiler-sdk":
options = self.get_profiler_options(native_tool_path=native_tool_path) options = self.get_profiler_options(native_tool_path=native_tool_path)
@@ -92,7 +92,6 @@ class rocprofiler_sdk_profiler(RocProfCompute_Base):
/ "librocprofiler-sdk-rocattach.so" / "librocprofiler-sdk-rocattach.so"
) )
options.update({ options.update({
"ROCPROF_ATTACH_TOOL_LIBRARY": rocprofiler_sdk_tool_path,
"ROCPROF_ATTACH_LIBRARY": rocprofiler_attach_library_path, "ROCPROF_ATTACH_LIBRARY": rocprofiler_attach_library_path,
"ROCPROF_ATTACH_PID": args.attach_pid, "ROCPROF_ATTACH_PID": args.attach_pid,
}) })
@@ -45,7 +45,7 @@ from sqlalchemy.sql import Select
from utils.logger import console_debug, console_error from utils.logger import console_debug, console_error
PREFIX = "compute_" PREFIX = "compute_"
SCHEMA_VERSION = "1.1.0" SCHEMA_VERSION = "1.2.0"
Base = declarative_base() Base = declarative_base()
@@ -63,18 +63,16 @@ class Workload(Base):
# Workload can have multiple kernels # Workload can have multiple kernels
kernels = relationship("Kernel", back_populates="workload") kernels = relationship("Kernel", back_populates="workload")
# Workload can have multiple roofline data points # Workload can have multiple metric definitions
roofline_data_points = relationship("RooflineData", back_populates="workload") metric_definitions = relationship("MetricDefinition", back_populates="workload")
# Workload can have multiple pc_sampling values
pc_sampling_values = relationship("PCsampling", back_populates="workload")
class Metric(Base): class MetricDefinition(Base):
__tablename__ = f"{PREFIX}metric" __tablename__ = f"{PREFIX}metric_definition"
metric_uuid = Column(Integer, primary_key=True) metric_uuid = Column(Integer, primary_key=True)
kernel_uuid = Column( workload_id = Column(
Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False
) )
name = Column(String) # e.g. Wavefronts Num name = Column(String) # e.g. Wavefronts Num
metric_id = Column(String) # e.g. 4.1.3 metric_id = Column(String) # e.g. 4.1.3
@@ -83,27 +81,26 @@ class Metric(Base):
sub_table_name = Column(String) # e.g. Wavefront stats sub_table_name = Column(String) # e.g. Wavefront stats
unit = Column(String) # e.g. Gbps unit = Column(String) # e.g. Gbps
# Metric can have one kernel # Metric can have one workload
kernel = relationship("Kernel", back_populates="metrics") workload = relationship("Workload", back_populates="metric_definitions")
# Metric can have multiple values # Metric can have multiple metric values
values = relationship("Value", back_populates="metric") metric_values = relationship("MetricValue", back_populates="metric")
class RooflineData(Base): class RooflineData(Base):
__tablename__ = f"{PREFIX}roofline_data" __tablename__ = f"{PREFIX}roofline_data"
roofline_uuid = Column(Integer, primary_key=True) roofline_uuid = Column(Integer, primary_key=True)
workload_id = Column( kernel_uuid = Column(
Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False
) )
kernel_name = Column(String)
total_flops = Column(Float) total_flops = Column(Float)
l1_cache_data = Column(Float) l1_cache_data = Column(Float)
l2_cache_data = Column(Float) l2_cache_data = Column(Float)
hbm_cache_data = Column(Float) hbm_cache_data = Column(Float)
# Roofline data point can have one workload # Roofline data point can have one kernel
workload = relationship("Workload", back_populates="roofline_data_points") kernel = relationship("Kernel", back_populates="roofline_data_points")
class Dispatch(Base): class Dispatch(Base):
@@ -135,42 +132,50 @@ class Kernel(Base):
workload = relationship("Workload", back_populates="kernels") workload = relationship("Workload", back_populates="kernels")
# Kernel can have multiple dispatches # Kernel can have multiple dispatches
dispatches = relationship("Dispatch", back_populates="kernel") dispatches = relationship("Dispatch", back_populates="kernel")
# Kernel can have multiple metrics # Kernel can have multiple metric values
metrics = relationship("Metric", back_populates="kernel") metric_values = relationship("MetricValue", back_populates="kernel")
# Kernel can have multiple roofline data points
roofline_data_points = relationship("RooflineData", back_populates="kernel")
# Kernel can have multiple pc_sampling values
pc_sampling_values = relationship("PCsampling", back_populates="kernel")
class PCsampling(Base): class PCsampling(Base):
__tablename__ = f"{PREFIX}pcsampling" __tablename__ = f"{PREFIX}pcsampling"
pc_sampling_uuid = Column(Integer, primary_key=True) pc_sampling_uuid = Column(Integer, primary_key=True)
workload_id = Column( kernel_uuid = Column(
Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False
) )
source = Column(String) source = Column(String)
instruction = Column(String) instruction = Column(String)
count = Column(Integer) count = Column(Integer)
kernel_name = Column(String)
offset = Column(Integer) offset = Column(Integer)
count_issue = Column(Integer) count_issue = Column(Integer)
count_stall = Column(Integer) count_stall = Column(Integer)
stall_reason = Column(JSON) stall_reason = Column(JSON)
# PCsampling can have one workload # PCsampling can have one kernel
workload = relationship("Workload", back_populates="pc_sampling_values") kernel = relationship("Kernel", back_populates="pc_sampling_values")
class Value(Base): class MetricValue(Base):
__tablename__ = f"{PREFIX}value" __tablename__ = f"{PREFIX}metric_value"
value_uuid = Column(Integer, primary_key=True) value_uuid = Column(Integer, primary_key=True)
metric_uuid = Column( metric_uuid = Column(
Integer, ForeignKey(f"{PREFIX}metric.metric_uuid"), nullable=False Integer, ForeignKey(f"{PREFIX}metric_definition.metric_uuid"), nullable=False
)
kernel_uuid = Column(
Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False
) )
value_name = Column(String) # e.g. min, max, avg value_name = Column(String) # e.g. min, max, avg
value = Column(Float) # e.g. 123.45 value = Column(Float) # e.g. 123.45
# Value can have one metric # Value can have one metric
metric = relationship("Metric", back_populates="values") metric = relationship("MetricDefinition", back_populates="metric_values")
# Value can have one kernel
kernel = relationship("Kernel", back_populates="metric_values")
class Metadata(Base): class Metadata(Base):
@@ -250,11 +255,20 @@ def get_views() -> list[TextClause]:
views: dict[str, Select[Any]] = { views: dict[str, Select[Any]] = {
"kernel_view": select( "kernel_view": select(
Kernel.kernel_uuid.label("kernel_uuid"),
Kernel.workload_id.label("workload_id"),
Workload.name.label("workload_name"),
Kernel.kernel_name, Kernel.kernel_name,
func.count(Dispatch.dispatch_id).label("dispatch_count"), func.count(Dispatch.dispatch_id).label("dispatch_count"),
func.sum(Dispatch.end_timestamp - Dispatch.start_timestamp).label( func.sum(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
"duration_ns_sum" "duration_ns_sum"
), ),
func.min(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
"duration_ns_min"
),
func.max(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
"duration_ns_max"
),
median_calc.c.duration_ns_median, median_calc.c.duration_ns_median,
func.avg(Dispatch.end_timestamp - Dispatch.start_timestamp).label( func.avg(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
"duration_ns_mean" "duration_ns_mean"
@@ -262,24 +276,31 @@ def get_views() -> list[TextClause]:
) )
.select_from(Dispatch) .select_from(Dispatch)
.join(Kernel, Dispatch.kernel_uuid == Kernel.kernel_uuid) .join(Kernel, Dispatch.kernel_uuid == Kernel.kernel_uuid)
.join(Workload, Kernel.workload_id == Workload.workload_id)
.join(median_calc.subquery(), Kernel.kernel_name == median_calc.c.kernel_name) .join(median_calc.subquery(), Kernel.kernel_name == median_calc.c.kernel_name)
.group_by(Kernel.kernel_name), .group_by(
Kernel.kernel_uuid, Kernel.workload_id, Workload.name, Kernel.kernel_name
),
"metric_view": select( "metric_view": select(
Workload.workload_id.label("workload_id"),
Workload.name.label("workload_name"), Workload.name.label("workload_name"),
Kernel.kernel_uuid.label("kernel_uuid"),
Kernel.kernel_name, Kernel.kernel_name,
Metric.name.label("metric_name"), MetricDefinition.metric_uuid.label("metric_uuid"),
Metric.metric_id, MetricDefinition.name.label("metric_name"),
Metric.description, MetricDefinition.metric_id,
Metric.table_name, MetricDefinition.description,
Metric.sub_table_name, MetricDefinition.table_name,
Metric.unit, MetricDefinition.sub_table_name,
Value.value_name, MetricDefinition.unit,
Value.value, MetricValue.value_uuid.label("value_uuid"),
MetricValue.value_name,
MetricValue.value,
) )
.select_from(Metric) .select_from(MetricDefinition)
.join(Kernel, Metric.kernel_uuid == Kernel.kernel_uuid) .join(Workload, MetricDefinition.workload_id == Workload.workload_id)
.join(Value, Metric.metric_uuid == Value.metric_uuid) .join(MetricValue, MetricDefinition.metric_uuid == MetricValue.metric_uuid)
.join(Workload, Kernel.workload_id == Workload.workload_id), .join(Kernel, MetricValue.kernel_uuid == Kernel.kernel_uuid),
} }
return [ return [
+6 -4
Visa fil
@@ -61,7 +61,7 @@ def multi_bar_chart(
def create_instruction_mix_bar_chart(display_df: pd.DataFrame, df_unit: str) -> px.bar: def create_instruction_mix_bar_chart(display_df: pd.DataFrame, df_unit: str) -> px.bar:
display_df = display_df.copy() display_df = display_df.copy()
display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "" else 0) display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "N/A" else 0)
return px.bar( return px.bar(
display_df, display_df,
@@ -78,7 +78,7 @@ def create_multi_bar_charts(
display_df: pd.DataFrame, table_id: int, df_unit: str display_df: pd.DataFrame, table_id: int, df_unit: str
) -> list[px.bar]: ) -> list[px.bar]:
display_df = display_df.copy() display_df = display_df.copy()
display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "" else 0) display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "N/A" else 0)
nested_bar = multi_bar_chart(table_id, display_df) nested_bar = multi_bar_chart(table_id, display_df)
charts = [] charts = []
@@ -103,7 +103,9 @@ def create_multi_bar_charts(
def create_sol_charts(display_df: pd.DataFrame, table_id: int) -> list[px.bar]: def create_sol_charts(display_df: pd.DataFrame, table_id: int) -> list[px.bar]:
display_df = display_df.copy() display_df = display_df.copy()
display_df["Avg"] = display_df["Avg"].apply(lambda x: float(x) if x != "" else 0.0) display_df["Avg"] = display_df["Avg"].apply(
lambda x: float(x) if x != "N/A" else 0.0
)
charts = [] charts = []
@@ -144,7 +146,7 @@ def create_sol_charts(display_df: pd.DataFrame, table_id: int) -> list[px.bar]:
elif table_id == 1101: elif table_id == 1101:
# Special formatting reference 'Pct of Peak' value # Special formatting reference 'Pct of Peak' value
display_df["Pct of Peak"] = display_df["Pct of Peak"].apply( display_df["Pct of Peak"] = display_df["Pct of Peak"].apply(
lambda x: float(x) if x != "" else 0.0 lambda x: float(x) if x != "N/A" else 0.0
) )
charts.append( charts.append(
px.bar( px.bar(
@@ -1290,6 +1290,8 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat
# NB: support ignoring the 1st n dispatched execution by '> n' # NB: support ignoring the 1st n dispatched execution by '> n'
# The better way may be parsing python slice string # The better way may be parsing python slice string
for dispatch_id in workload.filter_dispatch_ids: for dispatch_id in workload.filter_dispatch_ids:
if isinstance(dispatch_id, str) and ">" in dispatch_id:
dispatch_id = re.match(r"\>\s*(\d+)", dispatch_id).group(1)
if int(dispatch_id) >= len(df): # subtract 2 bc of the two header rows if int(dispatch_id) >= len(df): # subtract 2 bc of the two header rows
console_error("analysis", f"{dispatch_id} is an invalid dispatch id.") console_error("analysis", f"{dispatch_id} is an invalid dispatch id.")
@@ -1297,7 +1299,7 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat
isinstance(workload.filter_dispatch_ids[0], str) isinstance(workload.filter_dispatch_ids[0], str)
and ">" in workload.filter_dispatch_ids[0] and ">" in workload.filter_dispatch_ids[0]
): ):
dispatch_match = re.match(r"\> (\d+)", workload.filter_dispatch_ids[0]) dispatch_match = re.match(r"\>\s*(\d+)", workload.filter_dispatch_ids[0])
df = df[ df = df[
df[schema.PMC_PERF_FILE_PREFIX]["Dispatch_ID"] df[schema.PMC_PERF_FILE_PREFIX]["Dispatch_ID"]
> int(dispatch_match.group(1)) > int(dispatch_match.group(1))
+27 -13
Visa fil
@@ -174,15 +174,15 @@ def generate_machine_specs(
########################################## ##########################################
machine_info = extract_machine_info() machine_info = extract_machine_info()
# FIXME: use device
# Load amd-smi data
gpu_info = extract_gpu_info()
########################################## ##########################################
## B. SoC Specs ## B. SoC Specs
########################################## ##########################################
soc_info = extract_soc_info() soc_info = extract_soc_info()
# FIXME: use device
# Load amd-smi data
gpu_info = extract_gpu_info(gpu_arch=soc_info["gpu_arch"])
# Combine all specifications # Combine all specifications
with amdsmi_ctx(): with amdsmi_ctx():
specs = MachineSpecs( specs = MachineSpecs(
@@ -269,7 +269,16 @@ def extract_machine_info() -> dict[str, Any]:
@demarcate @demarcate
def extract_gpu_info() -> dict[str, Any]: def extract_gpu_info(gpu_arch: Optional[str]) -> dict[str, Any]:
# Partition is only supported on >= MI 300 series
# (gpu_arch should be gfx940 or higher for MI300+)
is_partition_supported = False
if gpu_arch and gpu_arch.startswith("gfx") and len(gpu_arch) >= 6:
try:
is_partition_supported = int(gpu_arch[3:6], 16) >= 0x940
except ValueError:
pass # Invalid hex string, keep is_partition_supported as False
result: dict[str, Optional[str]] = { result: dict[str, Optional[str]] = {
"vbios": None, "vbios": None,
"compute_partition": None, "compute_partition": None,
@@ -278,17 +287,22 @@ def extract_gpu_info() -> dict[str, Any]:
with amdsmi_ctx(): with amdsmi_ctx():
result["vbios"] = get_gpu_vbios_part_number() result["vbios"] = get_gpu_vbios_part_number()
result["compute_partition"] = get_gpu_compute_partition() if is_partition_supported:
result["memory_partition"] = get_gpu_memory_partition() result["compute_partition"] = get_gpu_compute_partition()
result["memory_partition"] = get_gpu_memory_partition()
else:
result["compute_partition"] = "N/A"
result["memory_partition"] = "N/A"
# Apply defaults and warnings # Apply defaults and warnings
if result["compute_partition"] == "N/A" or not result["compute_partition"]: if is_partition_supported:
console_warning("Cannot detect accelerator partition from amd-smi.") if result["compute_partition"] == "N/A" or not result["compute_partition"]:
console_warning("Applying default accelerator partition: SPX") console_warning("Cannot detect accelerator partition from amd-smi.")
result["compute_partition"] = "SPX" console_warning("Applying default accelerator partition: SPX")
result["compute_partition"] = "SPX"
if result["memory_partition"] == "N/A" or not result["memory_partition"]: if result["memory_partition"] == "N/A" or not result["memory_partition"]:
console_warning("Cannot detect memory partition from amd-smi.") console_warning("Cannot detect memory partition from amd-smi.")
console_debug( console_debug(
f"vbios is {result['vbios']}, compute partition is " f"vbios is {result['vbios']}, compute partition is "
+96 -43
Visa fil
@@ -235,6 +235,101 @@ def detect_rocprof(args: argparse.Namespace) -> str:
return rocprof_cmd return rocprof_cmd
def perform_attach_detach(new_env: dict[str, str], options: dict[str, Any]) -> None:
    """
    Attach the profiler library to a running process, wait, then detach.

    The attach library is loaded with ``ctypes`` while ``new_env`` is
    temporarily merged into ``os.environ``. Two library interfaces are
    supported: the old one (``attach`` / ``detach``) is tried first, and the
    new one (``rocattach_attach`` / ``rocattach_detach``, returning 0 on
    success) is used as a fallback.

    After a successful attach the function either waits for the user to press
    Enter or, when ``ROCPROF_ATTACH_DURATION`` is present in the environment,
    sleeps for that many milliseconds. Detach is always attempted once attach
    has succeeded, even if the wait is interrupted or the duration is invalid.

    Args:
        new_env: Environment variables applied to ``os.environ`` for the
            duration of the call; values are converted with ``str``.
        options: Must provide ``"ROCPROF_ATTACH_LIBRARY"`` (library to load)
            and ``"ROCPROF_ATTACH_PID"`` (target process ID).
    """

    @contextmanager
    def temporary_env(env_vars: dict[str, str]) -> Generator[None, None, None]:
        """
        Temporarily change the environment variable of this application.
        """
        original_env = os.environ.copy()
        os.environ.update({k: str(v) for k, v in env_vars.items()})
        try:
            yield
        finally:
            os.environ.clear()
            os.environ.update(original_env)

    with temporary_env(new_env):
        libname = options["ROCPROF_ATTACH_LIBRARY"]
        # NOTE(review): the code below assumes console_error() aborts the
        # program; otherwise c_lib would be unbound after a failed load.
        # ctypes.CDLL raises on failure and never returns None, so no
        # separate None check is needed.
        try:
            c_lib = ctypes.CDLL(libname)
        except Exception as load_err:
            console_error(f"Error loading {libname}: {load_err}")

        # Set argument and return types for attach/detach functions
        try:
            # old attach/detach API
            c_lib.attach.argtypes = [ctypes.c_uint]
        except Exception as old_api_err:
            console_debug(
                "Error setting old attach/detach API argument "
                f"types: {old_api_err}, trying new API"
            )
            try:
                # new attach/detach API
                c_lib.rocattach_attach.restype = ctypes.c_int
                c_lib.rocattach_attach.argtypes = [ctypes.c_int]
                c_lib.rocattach_detach.restype = ctypes.c_int
                c_lib.rocattach_detach.argtypes = [ctypes.c_int]
            except Exception as new_api_err:
                console_error(
                    f"Error setting attach/detach function argument types: "
                    f"{new_api_err}"
                )

        pid = options["ROCPROF_ATTACH_PID"]
        if pid is None:
            console_error("Mode of attach/detach must have setup for process ID")

        try:
            # old attach/detach API
            c_lib.attach(int(pid))
        except Exception as old_api_err:
            console_debug(
                f"Error attaching with old API: {old_api_err}, trying new API"
            )
            try:
                # new attach/detach API
                attach_status = c_lib.rocattach_attach(int(pid))
                if attach_status != 0:
                    console_error(
                        f"Error attaching to process {pid}, "
                        f"rocattach_attach returned {attach_status}"
                    )
            except Exception as new_api_err:
                console_error(f"Error attaching to process {pid}: {new_api_err}")

        # From here on the target process is attached. The wait can fail
        # (Ctrl-C / EOF on input(), non-numeric duration), so detach runs in
        # a finally block to avoid leaving the process attached.
        try:
            duration = os.environ.get("ROCPROF_ATTACH_DURATION", None)
            if duration is None:
                console_log(
                    f"\033[93mAttach to process with ID {pid} is successful, "
                    "Press Enter to detach...\033[0m"
                )
                input()
            else:
                console_log(
                    f"\033[93mAttach to process with ID {pid} is successful, "
                    f"detach will happen in {duration} milliseconds...\033[0m"
                )
                time.sleep(int(duration) / 1000)
        finally:
            try:
                # old attach/detach API
                c_lib.detach(int(pid))
            except Exception as old_api_err:
                console_debug(
                    f"Error detaching with old API: {old_api_err}, trying new API"
                )
                try:
                    # new attach/detach API
                    detach_status = c_lib.rocattach_detach(int(pid))
                    if detach_status != 0:
                        console_error(
                            f"Error detaching from process {pid}, "
                            f"rocattach_detach returned {detach_status}"
                        )
                except Exception as new_api_err:
                    console_error(
                        f"Error detaching from process {pid}: {new_api_err}"
                    )
def capture_subprocess_output( def capture_subprocess_output(
subprocess_args: list[str], subprocess_args: list[str],
new_env: Optional[dict[str, str]] = None, new_env: Optional[dict[str, str]] = None,
@@ -788,49 +883,7 @@ def run_prof(
console_debug(f"rocprof sdk env vars: {new_env}") console_debug(f"rocprof sdk env vars: {new_env}")
if is_mode_live_attach: if is_mode_live_attach:
perform_attach_detach(new_env, options)
@contextmanager
def temporary_env(env_vars: dict[str, str]) -> Generator[None, None, None]:
"""
Temporarily change the environment variable of this application.
"""
original_env = os.environ.copy()
os.environ.update({k: str(v) for k, v in env_vars.items()})
try:
yield
finally:
os.environ.clear()
os.environ.update(original_env)
with temporary_env(new_env):
libname = options["ROCPROF_ATTACH_LIBRARY"]
c_lib = ctypes.CDLL(libname)
if c_lib is None:
console_error(f"Error opening {libname}")
c_lib.attach.argtypes = [ctypes.c_uint]
pid = options["ROCPROF_ATTACH_PID"]
if pid is None:
console_error(
"Mode of attach/detach must have setup for process ID"
)
c_lib.attach(int(pid))
duration = os.environ.get("ROCPROF_ATTACH_DURATION", None)
if duration is None:
console_log(
f"\033[93mAttach to process with ID {pid} is successful, "
"Press Enter to detach...\033[0m"
)
input()
else:
console_log(
f"\033[93mAttach to process with ID {pid} is successful, "
f"detach will happen in {duration} milliseconds...\033[0m"
)
time.sleep(int(duration) / 1000)
c_lib.detach(int(pid))
else: else:
if app_cmd is None: if app_cmd is None:
console_error( console_error(
@@ -29,6 +29,7 @@ import re
import sqlite3 import sqlite3
import subprocess import subprocess
import sys import sys
import time
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
@@ -989,19 +990,19 @@ def test_analyze_rocpd(
Dispatch, Dispatch,
Kernel, Kernel,
Metadata, Metadata,
Metric, MetricDefinition,
MetricValue,
RooflineData, RooflineData,
Value,
Workload, Workload,
) )
table_name_map = { table_name_map = {
"compute_workload": Workload, "compute_workload": Workload,
"compute_metric": Metric, "compute_metric_definition": MetricDefinition,
"compute_roofline_data": RooflineData, "compute_roofline_data": RooflineData,
"compute_dispatch": Dispatch, "compute_dispatch": Dispatch,
"compute_kernel": Kernel, "compute_kernel": Kernel,
"compute_value": Value, "compute_metric_value": MetricValue,
"compute_metadata": Metadata, "compute_metadata": Metadata,
} }
@@ -2268,6 +2269,7 @@ def test_live_attach_detach_block(binary_handler_profile_rocprof_compute):
try: try:
# Start workload # Start workload
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env) process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
time.sleep(5) # Give workload time to start
attach_detach = { attach_detach = {
"attach_pid": process_workload.pid, "attach_pid": process_workload.pid,
@@ -2316,8 +2318,9 @@ def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_co
try: try:
# Start workload with sleep mode enabled # Start workload with sleep mode enabled
process_workload = subprocess.Popen( process_workload = subprocess.Popen(
[config["app_hip_dynamic_shared"], "--enable-sleep"], env=env [*config["app_hip_dynamic_shared"], "--enable-sleep"], env=env
) )
time.sleep(5) # Give workload time to start
attach_detach = { attach_detach = {
"attach_pid": process_workload.pid, "attach_pid": process_workload.pid,
@@ -2358,7 +2361,7 @@ def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_co
@pytest.mark.live_attach_detach @pytest.mark.live_attach_detach
def test_live_attach_detach_singlepath_launch_stats( def test_live_attach_detach_singlepass_launch_stats(
binary_handler_profile_rocprof_compute, binary_handler_profile_rocprof_compute,
): ):
options = ["--set", "launch_stats"] options = ["--set", "launch_stats"]
@@ -2374,6 +2377,7 @@ def test_live_attach_detach_singlepath_launch_stats(
try: try:
# Start workload # Start workload
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env) process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
time.sleep(5) # Give workload time to start
attach_detach = { attach_detach = {
"attach_pid": process_workload.pid, "attach_pid": process_workload.pid,
+14 -1
Visa fil
@@ -242,6 +242,19 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
## ROCprofiler-SDK 1.1.0 for ROCm release 7.2 ## ROCprofiler-SDK 1.1.0 for ROCm release 7.2
### Added ### Added
- Strix halo support for counter collection. - Counter collection support for `gfx1150` and `gfx1151` (Strix Halo).
- HSA Extension API v8 support.
- `hipStreamCopyAttributes` API implementation.
### Optimized
- Improved process attachment and updated the corresponding [documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3-process-attachment.html).
- Improved [Quick reference guide for rocprofv3](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/quick_guide.html).
- Updated installation documentation with links to the latest repository (https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/install/installation.html).
### Resolved issues
- Fixed multi-GPU dimension mismatch.
- Fixed device lock issue for dispatch counters.
- Addressed OpenMP Tools task scheduling null pointer exception.
- Fixed stream ID errors arising during process attachment.
- Fixed issues arising during dynamic code object loading.
@@ -1004,6 +1004,11 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipGetProcAddress_spt)
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 20 #if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 20
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo) ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo)
#endif #endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging)
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging)
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams)
#endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION == 0 #if HIP_RUNTIME_API_TABLE_STEP_VERSION == 0
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 442); static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 442);
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 1 #elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 1
@@ -1046,6 +1051,8 @@ static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 506);
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 507); static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 507);
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 20 #elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 508); static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 508);
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 511);
#else #else
# if !defined(ROCPROFILER_UNSAFE_NO_VERSION_CHECK) && \ # if !defined(ROCPROFILER_UNSAFE_NO_VERSION_CHECK) && \
(defined(ROCPROFILER_CI) && ROCPROFILER_CI > 0) (defined(ROCPROFILER_CI) && ROCPROFILER_CI > 0)
@@ -3377,6 +3377,26 @@ typedef union rocprofiler_hip_api_args_t
size_t* paramSize; size_t* paramSize;
} hipKernelGetParamInfo; } hipKernelGetParamInfo;
#endif #endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
struct
{
// Empty struct has a size of 0 in C but size of 1 in C++.
// Add the rocprofiler_hip_api_no_args struct to fix this
rocprofiler_hip_api_no_args no_args;
} hipExtDisableLogging;
struct
{
// Empty struct has a size of 0 in C but size of 1 in C++.
// Add the rocprofiler_hip_api_no_args struct to fix this
rocprofiler_hip_api_no_args no_args;
} hipExtEnableLogging;
struct
{
size_t log_level;
size_t log_size;
size_t log_mask;
} hipExtSetLoggingParams;
#endif
} rocprofiler_hip_api_args_t; } rocprofiler_hip_api_args_t;
ROCPROFILER_EXTERN_C_FINI ROCPROFILER_EXTERN_C_FINI
@@ -575,6 +575,11 @@ typedef enum rocprofiler_hip_runtime_api_id_t // NOLINT(performance-enum-size)
#endif #endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 20 #if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 20
ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo, ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo,
#endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging,
ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging,
ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams,
#endif #endif
ROCPROFILER_HIP_RUNTIME_API_ID_LAST, ROCPROFILER_HIP_RUNTIME_API_ID_LAST,
} rocprofiler_hip_runtime_api_id_t; } rocprofiler_hip_runtime_api_id_t;
@@ -623,6 +623,12 @@ ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipGetProcAddress_spt_fn, 506);
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipKernelGetParamInfo_fn, 507); ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipKernelGetParamInfo_fn, 507);
#endif #endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtDisableLogging_fn, 508);
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtEnableLogging_fn, 509);
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtSetLoggingParams_fn, 510);
#endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION == 0 #if HIP_RUNTIME_API_TABLE_STEP_VERSION == 0
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 442) ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 442)
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 1 #elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 1
@@ -665,6 +671,8 @@ ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 506)
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 507) ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 507)
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 20 #elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 508) ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 508)
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 511)
#else #else
INTERNAL_CI_ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 0) INTERNAL_CI_ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 0)
#endif #endif
@@ -650,6 +650,11 @@ HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNT
HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo, hipKernelGetParamInfo, hipKernelGetParamInfo_fn, kernel, paramIndex, paramOffset, paramSize); HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo, hipKernelGetParamInfo, hipKernelGetParamInfo_fn, kernel, paramIndex, paramOffset, paramSize);
#endif #endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
HIP_API_INFO_DEFINITION_0(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging, hipExtDisableLogging, hipExtDisableLogging_fn);
HIP_API_INFO_DEFINITION_0(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging, hipExtEnableLogging, hipExtEnableLogging_fn);
HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams, hipExtSetLoggingParams, hipExtSetLoggingParams_fn, log_level, log_size, log_mask);
#endif
// clang-format on // clang-format on
#else #else
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
# #
# function for # function for
# #
@@ -142,11 +145,27 @@ function(rocprofiler_systems_causal_example_executable _NAME)
endif() endif()
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES)
install( set(_TARGETS
TARGETS ${_NAME} ${_NAME}-rocprofsys ${_NAME}-coz ${_NAME}
DESTINATION bin ${_NAME}-rocprofsys
COMPONENT rocprofiler-systems-examples ${_NAME}-ndebug
OPTIONAL ${_NAME}-rocprofsys-ndebug
${_NAME}-coz
) )
set(_EXISTING_TARGETS)
foreach(_TARGET IN LISTS _TARGETS)
if(TARGET ${_TARGET})
list(APPEND _EXISTING_TARGETS ${_TARGET})
endif()
endforeach()
if(_EXISTING_TARGETS)
install(
TARGETS ${_EXISTING_TARGETS}
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endif() endif()
endfunction() endfunction()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-code-coverage-example LANGUAGES CXX) project(rocprofiler-systems-code-coverage-example LANGUAGES CXX)
@@ -22,7 +25,11 @@ target_link_libraries(code-coverage PRIVATE Threads::Threads)
target_compile_options(code-coverage PRIVATE ${_FLAGS}) target_compile_options(code-coverage PRIVATE ${_FLAGS})
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES)
install(TARGETS code-coverage DESTINATION bin COMPONENT rocprofiler-systems-examples) install(
TARGETS code-coverage
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif() endif()
set(PYTHON_FILES code-coverage.py) set(PYTHON_FILES code-coverage.py)
@@ -42,7 +49,7 @@ if(Python3_FOUND)
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES)
install( install(
PROGRAMS ${PROJECT_BINARY_DIR}/${_FILE} PROGRAMS ${PROJECT_BINARY_DIR}/${_FILE}
DESTINATION bin DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples COMPONENT rocprofiler-systems-examples
) )
endif() endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-fork LANGUAGES CXX) project(rocprofiler-systems-fork LANGUAGES CXX)
@@ -18,7 +21,13 @@ target_link_libraries(
target_compile_options(fork-example PRIVATE ${_FLAGS}) target_compile_options(fork-example PRIVATE ${_FLAGS})
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES)
install(TARGETS fork-example DESTINATION bin COMPONENT rocprofiler-systems-examples) if(TARGET fork-example)
install(
TARGETS fork-example
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endif() endif()
# HIP fork example (multi-process concurrency test) # HIP fork example (multi-process concurrency test)
@@ -81,10 +90,10 @@ if(HIPCC_EXECUTABLE)
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET hipMallocConcurrencyMproc) rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET hipMallocConcurrencyMproc)
endif() endif()
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET hipMallocConcurrencyMproc)
install( install(
TARGETS hipMallocConcurrencyMproc TARGETS hipMallocConcurrencyMproc
DESTINATION bin DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples COMPONENT rocprofiler-systems-examples
) )
endif() endif()
@@ -1,25 +1,5 @@
################################################################################ # Copyright (c) Advanced Micro Devices, Inc.
# Copyright (c) 2024 - 2025 Advanced Micro Devices, Inc. # SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
################################################################################
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
@@ -157,11 +137,15 @@ if(HIP_FOUND AND rocjpeg_FOUND AND Threads_FOUND AND rocprofiler-register_FOUND)
target_compile_options(jpegdecode PRIVATE ${_FLAGS}) target_compile_options(jpegdecode PRIVATE ${_FLAGS})
copy_image_files_and_make_copies() copy_image_files_and_make_copies()
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET jpegdecode)
install(TARGETS jpegdecode DESTINATION bin COMPONENT rocprofiler-systems-examples)
install( install(
FILES ${CMAKE_BINARY_DIR}/images TARGETS jpegdecode
DESTINATION share/rocprofiler-systems/tests/images DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
install(
DIRECTORY ${CMAKE_BINARY_DIR}/images/
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/images
COMPONENT rocprofiler-systems-examples COMPONENT rocprofiler-systems-examples
) )
endif() endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-lulesh-example LANGUAGES C CXX) project(rocprofiler-systems-lulesh-example LANGUAGES C CXX)
@@ -82,16 +85,11 @@ rocprofiler_systems_causal_example_executable(
INCLUDE_DIRECTORIES ${PROJECT_SOURCE_DIR}/includes INCLUDE_DIRECTORIES ${PROJECT_SOURCE_DIR}/includes
) )
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES AND LULESH_BUILD_KOKKOS)
if(LULESH_BUILD_KOKKOS) install(
install( TARGETS kokkoscore kokkoscontainers kokkossimd
TARGETS kokkoscore kokkoscontainers LIBRARY
DESTINATION ${CMAKE_INSTALL_LIBDIR} DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/lib
COMPONENT rocprofiler-systems-examples COMPONENT rocprofiler-systems-examples
) )
set_target_properties(
lulesh
PROPERTIES INSTALL_RPATH "\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}"
)
endif()
endif() endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-mpi-examples LANGUAGES C CXX) project(rocprofiler-systems-mpi-examples LANGUAGES C CXX)
@@ -77,16 +80,24 @@ add_executable(mpi-example mpi.cpp)
target_link_libraries(mpi-example PRIVATE mpi-cxx-interface-library) target_link_libraries(mpi-example PRIVATE mpi-cxx-interface-library)
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES)
install( set(MPI_EXAMPLES
TARGETS mpi-example
mpi-example mpi-allgather
mpi-allgather mpi-bcast
mpi-bcast mpi-all2all
mpi-all2all mpi-reduce
mpi-reduce mpi-scatter-gather
mpi-scatter-gather mpi-send-recv
mpi-send-recv mpi-allreduce
DESTINATION bin
COMPONENT rocprofiler-systems-examples
) )
foreach(MPI_EXAMPLE IN LISTS MPI_EXAMPLES)
if(TARGET ${MPI_EXAMPLE})
install(
TARGETS ${MPI_EXAMPLE}
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endforeach()
endif() endif()
@@ -89,13 +89,17 @@ endif()
target_link_libraries(openmp-cg PRIVATE openmp-common) target_link_libraries(openmp-cg PRIVATE openmp-common)
target_link_libraries(openmp-lu PRIVATE openmp-common) target_link_libraries(openmp-lu PRIVATE openmp-common)
if(ROCPROFSYS_INSTALL_EXAMPLES) set(OPENMP_EXAMPLES openmp-cg openmp-lu)
install(
TARGETS openmp-cg openmp-lu foreach(OPENMP_EXAMPLE IN LISTS OPENMP_EXAMPLES)
DESTINATION bin if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET ${OPENMP_EXAMPLE})
COMPONENT rocprofiler-systems-examples install(
) TARGETS ${OPENMP_EXAMPLE}
endif() DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endforeach()
set(DEFAULT_GPU_TARGETS set(DEFAULT_GPU_TARGETS
"gfx900" "gfx900"
@@ -280,3 +280,19 @@ set(ROCPROFSYS_OMPVV_OFFLOAD_TESTS
rocprofiler_systems_message(STATUS rocprofiler_systems_message(STATUS
"Successfully configured OMPVV" "Successfully configured OMPVV"
) )
# When example installation is enabled, install the OMPVV host and offload
# test programs into the shared examples directory. A test is installed only
# if its "<test>-build" target exists in this configuration.
if(ROCPROFSYS_INSTALL_EXAMPLES)
    foreach(
        test_target
        IN
        LISTS ROCPROFSYS_OMPVV_HOST_TESTS ROCPROFSYS_OMPVV_OFFLOAD_TESTS
    )
        if(TARGET "${test_target}-build")
            # PROGRAMS (not TARGETS) is used: the file is taken by path from
            # OMPVV_BIN_DEST. NOTE(review): presumably the "-build" target is
            # what produces that file -- confirm.
            install(
                PROGRAMS "${OMPVV_BIN_DEST}/${test_target}"
                DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
                COMPONENT rocprofiler-systems-examples
            )
        endif()
    endforeach()
endif()
@@ -71,7 +71,7 @@ endif()
# Use the directory that actually contains the library we found # Use the directory that actually contains the library we found
get_filename_component(_rocm_llvm_lib "${LIBOMPTARGET_SO}" DIRECTORY) get_filename_component(_rocm_llvm_lib "${LIBOMPTARGET_SO}" DIRECTORY)
set(_rocm_clang_lib "${ROCM_ROOT_DIR}/lib") set(_rocm_clang_lib "${ROCM_ROOT_DIR}/lib")
set(_COMMON_RPATH "${_rocm_llvm_lib};${_rocm_clang_lib};$ORIGIN") set(_COMMON_RPATH "${_rocm_llvm_lib};${_rocm_clang_lib};$ORIGIN;$ORIGIN/lib")
if(ROCmVersion_DIR) if(ROCmVersion_DIR)
list(APPEND _COMMON_RPATH "${ROCmVersion_DIR}/llvm/lib") list(APPEND _COMMON_RPATH "${ROCmVersion_DIR}/llvm/lib")
endif() endif()
@@ -126,3 +126,15 @@ rocprofiler_systems_custom_compilation(
rocprofiler_systems_custom_compilation(TARGET openmp-target rocprofiler_systems_custom_compilation(TARGET openmp-target
COMPILER ${OMP_TARGET_COMPILER} COMPILER ${OMP_TARGET_COMPILER}
) )
# Install the openmp-target example and its companion library when example
# installation is enabled and both targets exist.
#
# In install(TARGETS), options that follow an artifact kind (RUNTIME, LIBRARY,
# ...) apply to that kind only. COMPONENT is therefore repeated for each kind;
# otherwise the executable would be placed in the default "Unspecified"
# component and be skipped by a component-based install.
if(ROCPROFSYS_INSTALL_EXAMPLES)
    if(TARGET openmp-target AND TARGET openmp-target-lib)
        install(
            TARGETS openmp-target openmp-target-lib
            RUNTIME
                DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
                COMPONENT rocprofiler-systems-examples
            LIBRARY
                DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/lib
                COMPONENT rocprofiler-systems-examples
        )
    endif()
endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-parallel-overhead-example LANGUAGES CXX) project(rocprofiler-systems-parallel-overhead-example LANGUAGES CXX)
@@ -32,10 +35,13 @@ target_link_libraries(
) )
target_compile_definitions(parallel-overhead-locks PRIVATE USE_LOCKS=1) target_compile_definitions(parallel-overhead-locks PRIVATE USE_LOCKS=1)
if(ROCPROFSYS_INSTALL_EXAMPLES) set(PARALLEL_OVERHEAD_EXAMPLES parallel-overhead parallel-overhead-locks)
install( foreach(PARALLEL_OVERHEAD_EXAMPLE IN LISTS PARALLEL_OVERHEAD_EXAMPLES)
TARGETS parallel-overhead parallel-overhead-locks if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET ${PARALLEL_OVERHEAD_EXAMPLE})
DESTINATION bin install(
COMPONENT rocprofiler-systems-examples TARGETS ${PARALLEL_OVERHEAD_EXAMPLE}
) DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
endif() COMPONENT rocprofiler-systems-examples
)
endif()
endforeach()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-python) project(rocprofiler-systems-python)
@@ -36,7 +39,7 @@ if(Python3_FOUND)
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES)
install( install(
PROGRAMS ${PROJECT_BINARY_DIR}/${_FILE} PROGRAMS ${PROJECT_BINARY_DIR}/${_FILE}
DESTINATION bin DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples COMPONENT rocprofiler-systems-examples
) )
endif() endif()
@@ -138,6 +138,14 @@ if(hip_FOUND AND rccl_FOUND)
add_dependencies(rccl-tests::${_EXE_NAME} copy-${_EXE_NAME}) add_dependencies(rccl-tests::${_EXE_NAME} copy-${_EXE_NAME})
list(APPEND _RCCL_TEST_TARGETS "rccl-tests::${_EXE_NAME}") list(APPEND _RCCL_TEST_TARGETS "rccl-tests::${_EXE_NAME}")
# When example installation is enabled, install the file at _EXE_DEST_PATH
# (presumably the copied rccl-tests executable for this loop iteration) as a
# program into the shared examples directory.
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(
PROGRAMS ${_EXE_DEST_PATH}
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endforeach() endforeach()
set(RCCL_TEST_TARGETS "${_RCCL_TEST_TARGETS}" CACHE INTERNAL "rccl-test targets") set(RCCL_TEST_TARGETS "${_RCCL_TEST_TARGETS}" CACHE INTERNAL "rccl-test targets")
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-rewrite-caller-example LANGUAGES CXX) project(rocprofiler-systems-rewrite-caller-example LANGUAGES CXX)
@@ -18,6 +21,10 @@ set(CMAKE_BUILD_TYPE "Debug")
add_executable(rewrite-caller rewrite-caller.cpp) add_executable(rewrite-caller rewrite-caller.cpp)
target_compile_options(rewrite-caller PRIVATE ${_FLAGS}) target_compile_options(rewrite-caller PRIVATE ${_FLAGS})
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET rewrite-caller)
install(TARGETS rewrite-caller DESTINATION bin COMPONENT rocprofiler-systems-examples) install(
TARGETS rewrite-caller
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif() endif()
@@ -1,24 +1,5 @@
# MIT License # Copyright (c) Advanced Micro Devices, Inc.
# # SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
@@ -113,6 +94,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET roctx) rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET roctx)
endif() endif()
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET roctx)
install(TARGETS roctx DESTINATION bin COMPONENT rocprofiler-systems-examples) install(
TARGETS roctx
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif() endif()
@@ -40,3 +40,11 @@ target_compile_options(tests-compile-options INTERFACE -g)
add_executable(thread-limit thread-limit.cpp) add_executable(thread-limit thread-limit.cpp)
target_compile_definitions(thread-limit PRIVATE MAX_THREADS=${ROCPROFSYS_MAX_THREADS}) target_compile_definitions(thread-limit PRIVATE MAX_THREADS=${ROCPROFSYS_MAX_THREADS})
target_link_libraries(thread-limit PRIVATE Threads::Threads tests-compile-options) target_link_libraries(thread-limit PRIVATE Threads::Threads tests-compile-options)
# Install the thread-limit example into the shared examples directory, but
# only when example installation is requested and the target exists.
if(ROCPROFSYS_INSTALL_EXAMPLES)
    if(TARGET thread-limit)
        install(
            TARGETS thread-limit
            DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
            COMPONENT rocprofiler-systems-examples
        )
    endif()
endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-trace-time-window-example LANGUAGES CXX) project(rocprofiler-systems-trace-time-window-example LANGUAGES CXX)
@@ -18,10 +21,10 @@ set(CMAKE_BUILD_TYPE "Debug")
add_executable(trace-time-window trace-time-window.cpp) add_executable(trace-time-window trace-time-window.cpp)
target_compile_options(trace-time-window PRIVATE ${_FLAGS}) target_compile_options(trace-time-window PRIVATE ${_FLAGS})
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET trace-time-window)
install( install(
TARGETS trace-time-window TARGETS trace-time-window
DESTINATION bin DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples COMPONENT rocprofiler-systems-examples
) )
endif() endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-transferBench-example LANGUAGES CXX) project(rocprofiler-systems-transferBench-example LANGUAGES CXX)
@@ -120,6 +123,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transferBench) rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transferBench)
endif() endif()
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET transferBench)
install(TARGETS transferBench DESTINATION bin COMPONENT rocprofiler-systems-examples) install(
TARGETS transferBench
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif() endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-transpose-example LANGUAGES CXX) project(rocprofiler-systems-transpose-example LANGUAGES CXX)
@@ -93,6 +96,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transpose) rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transpose)
endif() endif()
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET transpose)
install(TARGETS transpose DESTINATION bin COMPONENT rocprofiler-systems-examples) install(
TARGETS transpose
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif() endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-user-api-example LANGUAGES CXX) project(rocprofiler-systems-user-api-example LANGUAGES CXX)
@@ -24,6 +27,10 @@ target_link_libraries(
PRIVATE Threads::Threads rocprofiler-systems::rocprofiler-systems-user-library PRIVATE Threads::Threads rocprofiler-systems::rocprofiler-systems-user-library
) )
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET user-api)
install(TARGETS user-api DESTINATION bin COMPONENT rocprofiler-systems-examples) install(
TARGETS user-api
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif() endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR) cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-videodecode-example LANGUAGES CXX) project(rocprofiler-systems-videodecode-example LANGUAGES CXX)
@@ -176,15 +179,15 @@ if(FFMPEG_FOUND AND rocdecode_FOUND)
target_compile_definitions(videodecode PUBLIC USE_AVCODEC_GREATER_THAN_58_134=1) target_compile_definitions(videodecode PUBLIC USE_AVCODEC_GREATER_THAN_58_134=1)
endif() endif()
if(ROCPROFSYS_INSTALL_EXAMPLES) if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET videodecode)
install( install(
TARGETS videodecode TARGETS videodecode
DESTINATION bin DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples COMPONENT rocprofiler-systems-examples
) )
install( install(
FILES ${CMAKE_BINARY_DIR}/videos DIRECTORY ${CMAKE_BINARY_DIR}/videos/
DESTINATION share/rocprofiler-systems/tests/videos DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/videos
COMPONENT rocprofiler-systems-examples COMPONENT rocprofiler-systems-examples
) )
endif() endif()
@@ -192,6 +192,10 @@ prepare_environment_for_run(parser_data_t& _data)
rocprofsys::argparse::add_ld_preload(_data); rocprofsys::argparse::add_ld_preload(_data);
rocprofsys::argparse::add_ld_library_path(_data); rocprofsys::argparse::add_ld_library_path(_data);
} }
rocprofsys::argparse::add_torch_library_path(_data, _data.verbose > 0);
rocprofsys::common::consolidate_env_entries(_data.current);
} }
void void
@@ -933,3 +933,9 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
return _outv; return _outv;
} }
// Forwards to rocprofsys::common::add_torch_library_path() for the given
// environment (envp) and command line (argv).
//
// NOTE(review): `verbose` and `updated_envs` are not parameters or locals of
// this function; presumably they are file-scope state of this translation
// unit -- confirm against the surrounding file.
void
add_torch_library_path(std::vector<char*>& envp, const std::vector<char*>& argv)
{
rocprofsys::common::add_torch_library_path(envp, argv, verbose > 0, updated_envs);
}
@@ -51,6 +51,8 @@ main(int argc, char** argv)
_argv.emplace_back(argv[i]); _argv.emplace_back(argv[i]);
} }
add_torch_library_path(_env, _argv);
print_updated_environment(_env); print_updated_environment(_env);
if(!_argv.empty()) if(!_argv.empty())
@@ -35,3 +35,6 @@ get_initial_environment();
std::vector<char*> std::vector<char*>
parse_args(int argc, char** argv, std::vector<char*>& envp); parse_args(int argc, char** argv, std::vector<char*>& envp);
void
add_torch_library_path(std::vector<char*>& envp, const std::vector<char*>& argv);
@@ -26,9 +26,12 @@
#include "common/join.hpp" #include "common/join.hpp"
#include <algorithm> #include <algorithm>
#include <cctype>
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
#include <numeric>
#include <sstream>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <string_view> #include <string_view>
@@ -197,7 +200,7 @@ remove_env(std::vector<char*>& _environ, std::string_view _env_var,
{ {
if(match(itr)) if(match(itr))
{ {
free(itr); std::free(itr);
itr = nullptr; itr = nullptr;
} }
} }
@@ -266,6 +269,113 @@ discover_llvm_libdir_for_ompt(bool verbose = false)
return {}; return {};
} }
/// Returns true when the file-name component of @p executable is exactly
/// "python", "python3", or "python3.<digits>" (e.g. "python3.11").
///
/// Any leading directory components (everything up to the last '/') are
/// ignored. Empty names, other major versions ("python2") and names with
/// non-digit characters after "python3." are rejected.
inline bool
is_python_interpreter(std::string_view executable)
{
    // Reduce the path to its final component
    std::string_view name = executable;
    const auto       last_sep = executable.find_last_of('/');
    if(last_sep != std::string_view::npos) name.remove_prefix(last_sep + 1);

    if(name.empty()) return false;
    if(name == "python" || name == "python3") return true;

    // Anything else must look like "python3.<one or more digits>"
    constexpr std::string_view versioned_prefix = "python3.";
    if(name.size() <= versioned_prefix.size()) return false;
    if(name.compare(0, versioned_prefix.size(), versioned_prefix) != 0) return false;

    name.remove_prefix(versioned_prefix.size());
    for(unsigned char ch : name)
    {
        if(std::isdigit(ch) == 0) return false;
    }
    return true;
}
/// Asks the given Python interpreter where the `torch` package lives and
/// returns "<torch package dir>/lib".
///
/// @param python_binary  Interpreter to run; must consist only of
///                       alphanumerics and the characters "/._-+".
/// @param verbose        Forwarded to ROCPROFSYS_ENVIRON_LOG for diagnostics.
/// @return The lib directory, or an empty string when the interpreter path is
///         empty or rejected, the command cannot be started or exits with a
///         non-zero status, nothing is printed, or the directory is missing.
inline std::string
discover_torch_libpath(const std::string& python_binary, bool verbose = false)
{
    if(python_binary.empty()) return {};

    // The path is interpolated into a shell command below, so only a
    // conservative character whitelist is accepted to avoid injection.
    static constexpr std::string_view allowed_punctuation = "/._-+";
    const bool path_is_safe =
        std::all_of(python_binary.begin(), python_binary.end(), [](unsigned char ch) {
            return std::isalnum(ch) != 0 ||
                   allowed_punctuation.find(static_cast<char>(ch)) !=
                       std::string_view::npos;
        });

    if(!path_is_safe)
    {
        ROCPROFSYS_ENVIRON_LOG(
            verbose, "Unsafe characters detected in Python interpreter path: %s\n",
            python_binary.c_str());
        return {};
    }

    std::string cmd = "\"";
    cmd += python_binary;
    cmd += "\" -c \"import torch; print(torch.__path__[0])\" 2>/dev/null";

    FILE* pipe = popen(cmd.c_str(), "r");
    if(pipe == nullptr)
    {
        ROCPROFSYS_ENVIRON_LOG(verbose, "Failed to execute command: %s\n", cmd.c_str());
        return {};
    }

    // Read until the first complete line; the torch path is printed on one line
    std::string output;
    char        chunk[1024];
    while(fgets(chunk, sizeof(chunk), pipe) != nullptr)
    {
        output.append(chunk);
        if(!output.empty() && output.back() == '\n') break;
    }

    const int exit_status = pclose(pipe);
    if(exit_status != 0 || output.empty())
    {
        ROCPROFSYS_ENVIRON_LOG(verbose, "torch not found for Python interpreter: %s\n",
                               python_binary.c_str());
        return {};
    }

    // Drop trailing newline / carriage return / space characters
    const auto last_kept = output.find_last_not_of("\n\r ");
    if(last_kept == std::string::npos) return {};
    output.erase(last_kept + 1);

    std::string torch_libdir = output + "/lib";
    if(!::tim::filepath::direxists(torch_libdir))
    {
        ROCPROFSYS_ENVIRON_LOG(verbose, "torch lib directory does not exist: %s\n",
                               torch_libdir.c_str());
        return {};
    }

    ROCPROFSYS_ENVIRON_LOG(verbose, "Discovered torch library path: %s\n",
                           torch_libdir.c_str());
    return torch_libdir;
}
enum class update_mode : uint8_t enum class update_mode : uint8_t
{ {
REPLACE = 0, REPLACE = 0,
@@ -335,7 +445,7 @@ update_env(std::vector<char*>& _environ, std::string_view _env_var, Tp&& _env_va
} }
else else
{ {
free(itr); std::free(itr);
itr = strdup(join('=', _env_var, _env_val_str).c_str()); itr = strdup(join('=', _env_var, _env_val_str).c_str());
} }
return; return;
@@ -343,5 +453,145 @@ update_env(std::vector<char*>& _environ, std::string_view _env_var, Tp&& _env_va
_environ.emplace_back(strdup(join('=', _env_var, _env_val_str).c_str())); _environ.emplace_back(strdup(join('=', _env_var, _env_val_str).c_str()));
} }
/// Prepends the torch library directory to LD_LIBRARY_PATH when the launched
/// command (argv.front()) is a Python interpreter for which torch is found.
///
/// All existing LD_LIBRARY_PATH entries in @p envp are merged into a single
/// entry: the torch directory first, followed by the previously listed
/// directories in order, with empty and duplicate directories removed. The
/// old entries are freed and a newly strdup'ed entry is appended. Null
/// entries are removed from @p envp as part of the rewrite.
/// "LD_LIBRARY_PATH" is recorded in @p updated_envs.
///
/// Nothing is modified when argv is empty, argv.front() is null or not a
/// Python interpreter, or no torch library directory is discovered.
template <typename UpdatedEnvsT>
inline void
add_torch_library_path(std::vector<char*>& envp, const std::vector<char*>& argv,
                       bool verbose, UpdatedEnvsT& updated_envs)
{
    const char* interpreter = argv.empty() ? nullptr : argv.front();
    if(interpreter == nullptr || !is_python_interpreter(interpreter)) return;

    const auto torch_libpath = discover_torch_libpath(interpreter, verbose);
    if(torch_libpath.empty()) return;

    constexpr std::string_view ld_var_name = "LD_LIBRARY_PATH";
    constexpr std::string_view ld_prefix   = "LD_LIBRARY_PATH=";

    std::string                     merged = torch_libpath;
    std::unordered_set<std::string> known{ torch_libpath };

    for(char*& slot : envp)
    {
        if(slot == nullptr) continue;

        std::string_view remaining{ slot };
        if(remaining.compare(0, ld_prefix.size(), ld_prefix) != 0) continue;
        remaining.remove_prefix(ld_prefix.size());

        // Walk the ':'-separated directories, keeping first occurrences only
        while(!remaining.empty())
        {
            const auto colon = remaining.find(':');
            const auto dir   = remaining.substr(0, colon);
            if(!dir.empty() && known.emplace(dir).second)
            {
                merged += ':';
                merged += dir;
            }
            if(colon == std::string_view::npos) break;
            remaining.remove_prefix(colon + 1);
        }

        // This entry is superseded by the merged one appended below
        std::free(slot);
        slot = nullptr;
    }

    envp.erase(std::remove(envp.begin(), envp.end(), nullptr), envp.end());
    envp.emplace_back(strdup(join("", ld_prefix, merged).c_str()));
    updated_envs.emplace(ld_var_name);
}
/// Collapses repeated "KEY=value" entries in @p envp into one entry per key.
///
/// Every value is split on ':' and the pieces of all entries sharing a key are
/// concatenated in first-seen order, dropping empty and duplicate pieces.
/// Keys keep the order of their first appearance. Null entries and entries
/// without '=' are dropped from the result.
///
/// All original entries are released with std::free() and @p envp is replaced
/// by newly strdup'ed strings, so the caller owns (and must free) the result.
///
/// NOTE(review): the split/dedup is applied to every variable, not only to
/// path-like ones -- confirm that this is intended for values where ':' or
/// repeated segments are meaningful.
inline void
consolidate_env_entries(std::vector<char*>& envp)
{
    static constexpr char delim = ':';

    // Ordered, de-duplicated value pieces collected for a single key
    struct key_data
    {
        std::vector<std::string>        parts;
        std::unordered_set<std::string> seen;

        void add_unique(std::string part)
        {
            if(!part.empty() && seen.insert(part).second)
                parts.emplace_back(std::move(part));
        }
    };

    // Builds "KEY=p0:p1:...". A key without parts yields "KEY=".
    auto join_parts = [](std::string_view key, const std::vector<std::string>& parts) {
        // key + '=' + every part followed by at most one delimiter; a slight
        // over-estimate that cannot wrap around when parts is empty
        std::size_t capacity = key.size() + 1;
        for(const auto& part : parts)
            capacity += part.size() + 1;

        std::string joined;
        joined.reserve(capacity);
        joined.append(key);
        joined += '=';
        for(std::size_t idx = 0; idx < parts.size(); ++idx)
        {
            // separate by position rather than by comparing values
            if(idx != 0) joined += delim;
            joined += parts[idx];
        }
        return joined;
    };

    // The keys are views into the original entries, which stay alive until
    // they are freed at the end of this function.
    std::unordered_map<std::string_view, key_data> key_map;
    std::vector<std::string_view>                  key_order;

    for(const char* entry : envp)
    {
        if(entry == nullptr) continue;

        const std::string_view view{ entry };
        const auto             eq_pos = view.find('=');
        if(eq_pos == std::string_view::npos) continue;

        const auto key      = view.substr(0, eq_pos);
        auto [it, inserted] = key_map.try_emplace(key);
        if(inserted) key_order.emplace_back(key);

        auto&              data = it->second;
        std::istringstream stream{ std::string{ view.substr(eq_pos + 1) } };
        for(std::string part; std::getline(stream, part, delim);)
        {
            data.add_unique(part);
        }
    }

    // Build the replacement entries before releasing the originals, because
    // the keys still reference the original storage.
    std::vector<char*> result;
    result.reserve(key_order.size());
    for(const auto key : key_order)
    {
        result.emplace_back(strdup(join_parts(key, key_map.at(key).parts).c_str()));
    }

    for(char* entry : envp)
    {
        std::free(entry);
    }

    envp = std::move(result);
}
} // namespace common } // namespace common
} // namespace rocprofsys } // namespace rocprofsys
@@ -24,6 +24,7 @@ add_library(
lib-common-tests lib-common-tests
OBJECT OBJECT
test_discover_llvm_libdir.cpp test_discover_llvm_libdir.cpp
test_environment.cpp
test_path.cpp test_path.cpp
test_remove_env.cpp test_remove_env.cpp
test_update_env.cpp test_update_env.cpp
@@ -0,0 +1,146 @@
// Copyright (c) Advanced Micro Devices, Inc.
// SPDX-License-Identifier: MIT
#include "common/environment.hpp"
#include <gtest/gtest.h>
using namespace rocprofsys::common;
// Fixture for is_python_interpreter(); no shared state is required.
class IsPythonInterpreterTest : public ::testing::Test
{};

// Checks the accepted spellings (python, python3, python3.<digits>, with or
// without a directory prefix) and a range of names that must be rejected.
TEST_F(IsPythonInterpreterTest, RecognizesPython)
{
    const char* const accepted[] = {
        "python",
        "python3",
        "python3.8",
        "python3.9",
        "python3.10",
        "python3.11",
        "python3.12",
        "/usr/bin/python",
        "/usr/bin/python3",
        "/usr/bin/python3.10",
        "/home/user/venv/bin/python",
        "/opt/conda/bin/python3.11",
    };

    const char* const rejected[] = {
        "bash",
        "sh",
        "ruby",
        "node",
        "java",
        "/usr/bin/bash",
        "./my_app",
        "pythonista",
        "python_script.py",
        "mypython",
        "python2",
        "python3.",
        "python3.a",
        "python3.10a",
        "python3x10",
        "",
        "/usr/bin/",
    };

    for(const char* candidate : accepted)
    {
        EXPECT_TRUE(is_python_interpreter(candidate)) << candidate;
    }

    for(const char* candidate : rejected)
    {
        EXPECT_FALSE(is_python_interpreter(candidate)) << candidate;
    }
}
// Fixture for consolidate_env_entries(); no shared state is required.
class DuplicatedEnvironmentEntriesTest : public ::testing::Test
{};

// Two entries for the same key are merged; pieces keep first-seen order and
// duplicates are dropped.
TEST_F(DuplicatedEnvironmentEntriesTest, DuplicateEnvironmentEntries)
{
    std::vector<char*> env_vars = {
        strdup("PATH=/usr/local/bin:/usr/bin:/bin:/usr/local/bin2"),
        strdup("PATH=/usr/local/bin:/usr/bin:/bin"),
    };
    consolidate_env_entries(env_vars);

    // unsigned literal: avoids a signed/unsigned comparison against size()
    ASSERT_EQ(env_vars.size(), 1u);
    EXPECT_STREQ(env_vars[0], "PATH=/usr/local/bin:/usr/bin:/bin:/usr/local/bin2");

    for(auto* entry : env_vars)
        std::free(entry);
}

// An empty environment stays empty.
TEST_F(DuplicatedEnvironmentEntriesTest, HandlesEmptyVector)
{
    std::vector<char*> env_vars;
    consolidate_env_entries(env_vars);
    EXPECT_TRUE(env_vars.empty());
}

// Null entries are skipped and do not prevent merging of their neighbours.
TEST_F(DuplicatedEnvironmentEntriesTest, HandlesNullEntries)
{
    std::vector<char*> env_vars = {
        strdup("PATH=/usr/bin"),
        nullptr,
        strdup("PATH=/bin"),
    };
    consolidate_env_entries(env_vars);

    ASSERT_EQ(env_vars.size(), 1u);
    EXPECT_STREQ(env_vars[0], "PATH=/usr/bin:/bin");

    for(auto* entry : env_vars)
        std::free(entry);
}

// A variable with an empty value is preserved as "KEY=" and keeps its place.
TEST_F(DuplicatedEnvironmentEntriesTest, HandlesEmptyValues)
{
    std::vector<char*> env_vars = {
        strdup("EMPTY_VAR="),
        strdup("PATH=/usr/bin"),
    };
    consolidate_env_entries(env_vars);

    ASSERT_EQ(env_vars.size(), 2u);
    EXPECT_STREQ(env_vars[0], "EMPTY_VAR=");
    EXPECT_STREQ(env_vars[1], "PATH=/usr/bin");

    for(auto* entry : env_vars)
        std::free(entry);
}
class AddTorchLibraryPathTest : public ::testing::Test
{
protected:
std::unordered_set<std::string> updated_envs;
};
TEST_F(AddTorchLibraryPathTest, SkipsNonPythonExecutables)
{
std::vector<char*> envp = {
strdup("LD_LIBRARY_PATH=/usr/lib"),
};
std::vector<char*> argv = {
strdup("/usr/bin/bash"),
};
add_torch_library_path(envp, argv, false, updated_envs);
// Should not modify environment
ASSERT_EQ(envp.size(), 1);
EXPECT_STREQ(envp[0], "LD_LIBRARY_PATH=/usr/lib");
for(auto* entry : envp)
std::free(entry);
for(auto* entry : argv)
std::free(entry);
}
TEST_F(AddTorchLibraryPathTest, HandlesEmptyArgv)
{
std::vector<char*> envp = {
strdup("LD_LIBRARY_PATH=/usr/lib"),
};
std::vector<char*> argv;
add_torch_library_path(envp, argv, false, updated_envs);
ASSERT_EQ(envp.size(), 1);
EXPECT_STREQ(envp[0], "LD_LIBRARY_PATH=/usr/lib");
for(auto* entry : envp)
std::free(entry);
}
TEST_F(AddTorchLibraryPathTest, HandlesNullArgvFront)
{
std::vector<char*> envp = {
strdup("LD_LIBRARY_PATH=/usr/lib"),
};
std::vector<char*> argv = { nullptr };
add_torch_library_path(envp, argv, false, updated_envs);
ASSERT_EQ(envp.size(), 1);
for(auto* entry : envp)
std::free(entry);
}
@@ -168,6 +168,14 @@ add_ld_library_path(parser_data& _data)
return _data; return _data;
} }
// Applies rocprofsys::common::add_torch_library_path() to the parser state:
// the environment in _data.current is updated based on the command in
// _data.command, and updated variable names are recorded in _data.updated.
// Returns _data to allow chaining.
parser_data&
add_torch_library_path(parser_data& _data, bool verbose)
{
rocprofsys::common::add_torch_library_path(_data.current, _data.command, verbose,
_data.updated);
return _data;
}
parser_data& parser_data&
add_core_arguments(parser_t& _parser, parser_data& _data) add_core_arguments(parser_t& _parser, parser_data& _data)
{ {
@@ -83,6 +83,9 @@ add_ld_preload(parser_data&);
parser_data& parser_data&
add_ld_library_path(parser_data&); add_ld_library_path(parser_data&);
parser_data&
add_torch_library_path(parser_data&, bool verbose = false);
parser_data& parser_data&
add_core_arguments(parser_t&, parser_data&); add_core_arguments(parser_t&, parser_data&);
@@ -316,7 +316,9 @@ configure_settings(bool _init)
"backend", "perfetto"); "backend", "perfetto");
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE_LEGACY", ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE_LEGACY",
"Use legacy direct mode for perfetto tracing instead of " "[DEPRECATED] The new default option is to use data from "
"cached buffer. When set to true system will use "
"legacy direct mode for perfetto tracing instead of "
"deferred trace generation. When false (default), uses " "deferred trace generation. When false (default), uses "
"cached mode with minimal runtime overhead.", "cached mode with minimal runtime overhead.",
false, "backend", "perfetto"); false, "backend", "perfetto");
@@ -1088,6 +1090,9 @@ configure_settings(bool _init)
handle_deprecated_setting("ROCPROFSYS_OUTPUT_FILE", "ROCPROFSYS_PERFETTO_FILE"); handle_deprecated_setting("ROCPROFSYS_OUTPUT_FILE", "ROCPROFSYS_PERFETTO_FILE");
handle_deprecated_setting("ROCPROFSYS_USE_PERFETTO", "ROCPROFSYS_TRACE"); handle_deprecated_setting("ROCPROFSYS_USE_PERFETTO", "ROCPROFSYS_TRACE");
handle_deprecated_setting("ROCPROFSYS_USE_TIMEMORY", "ROCPROFSYS_PROFILE"); handle_deprecated_setting("ROCPROFSYS_USE_TIMEMORY", "ROCPROFSYS_PROFILE");
handle_deprecated_setting("ROCPROFSYS_DEBUG", "ROCPROFSYS_LOG_LEVEL");
handle_deprecated_setting("ROCPROFSYS_VERBOSE", "ROCPROFSYS_LOG_LEVEL");
handle_deprecated_setting("ROCPROFSYS_TRACE_LEGACY", "ROCPROFSYS_TRACE");
scope::get_fields()[scope::flat::value] = _config->get_flat_profile(); scope::get_fields()[scope::flat::value] = _config->get_flat_profile();
scope::get_fields()[scope::timeline::value] = _config->get_timeline_profile(); scope::get_fields()[scope::timeline::value] = _config->get_timeline_profile();
@@ -31,6 +31,8 @@
} \ } \
} // namespace ::tim::cereal } // namespace ::tim::cereal
#include "common/defines.h"
#if !defined(ROCPROFSYS_USE_ROCM) #if !defined(ROCPROFSYS_USE_ROCM)
# define ROCPROFSYS_USE_ROCM 0 # define ROCPROFSYS_USE_ROCM 0
#endif #endif
@@ -40,7 +42,6 @@
#include <timemory/manager.hpp> #include <timemory/manager.hpp>
#include <dlfcn.h>
#include <string> #include <string>
#include "core/agent_manager.hpp" #include "core/agent_manager.hpp"
@@ -92,17 +93,6 @@ _amdsmi_is_initialized()
return initialized; return initialized;
} }
void
prevent_amdsmi_library_unload()
{
static bool _initialized = false;
if(_initialized) return;
_initialized = true;
dlopen("libamd_smi.so", RTLD_NOW | RTLD_NOLOAD | RTLD_NODELETE);
dlopen("librocm_smi64.so", RTLD_NOW | RTLD_NOLOAD | RTLD_NODELETE);
}
bool bool
amdsmi_init() amdsmi_init()
{ {
@@ -113,8 +103,6 @@ amdsmi_init()
ROCPROFSYS_AMD_SMI_CALL(::amdsmi_init(AMDSMI_INIT_AMD_GPUS)); ROCPROFSYS_AMD_SMI_CALL(::amdsmi_init(AMDSMI_INIT_AMD_GPUS));
get_processor_handles(); get_processor_handles();
_amdsmi_is_initialized() = true; // Mark as initialized _amdsmi_is_initialized() = true; // Mark as initialized
prevent_amdsmi_library_unload();
} catch(std::exception& _e) } catch(std::exception& _e)
{ {
LOG_ERROR("Exception thrown initializing amd-smi: {}", _e.what()); LOG_ERROR("Exception thrown initializing amd-smi: {}", _e.what());
@@ -367,7 +367,6 @@ config_settings(const std::shared_ptr<settings>& _config)
_skip_domains.emplace("kernel_dispatch"); _skip_domains.emplace("kernel_dispatch");
_skip_domains.emplace("page_migration"); _skip_domains.emplace("page_migration");
_skip_domains.emplace("scratch_memory");
_add_operation_settings( _add_operation_settings(
"MARKER_API", callback_tracing_info[ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API], "MARKER_API", callback_tracing_info[ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API],
@@ -652,7 +651,7 @@ get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv)
{ {
if(callback_operation_option_names.count(kindv) == 0) if(callback_operation_option_names.count(kindv) == 0)
{ {
LOG_CRITICAL("callback_operation_operation_names does not have value for {}", LOG_CRITICAL("callback_operation_option_names does not have value for {}",
static_cast<int>(kindv)); static_cast<int>(kindv));
::rocprofsys::set_state(::rocprofsys::State::Finalized); ::rocprofsys::set_state(::rocprofsys::State::Finalized);
std::abort(); std::abort();
@@ -41,7 +41,7 @@ using storage_parser_t =
storage_parser<type_identifier_t, kernel_dispatch_sample, memory_copy_sample, storage_parser<type_identifier_t, kernel_dispatch_sample, memory_copy_sample,
memory_allocate_sample, region_sample, in_time_sample, memory_allocate_sample, region_sample, in_time_sample,
pmc_event_with_sample, amd_smi_sample, cpu_freq_sample, pmc_event_with_sample, amd_smi_sample, cpu_freq_sample,
backtrace_region_sample>; backtrace_region_sample, scratch_memory_sample>;
using buffer_storage_t = buffer_storage<flush_worker_factory_t, type_identifier_t>; using buffer_storage_t = buffer_storage<flush_worker_factory_t, type_identifier_t>;
@@ -540,6 +540,72 @@ perfetto_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds
#endif #endif
} }
// Emits perfetto trace data for one scratch-memory sample: an optional
// counter sample for the allocation size (ROCm >= 7.0.2 builds, ALLOC
// operations only) and a begin/end slice on a per-thread
// "GPU Scratch Memory Events" track. Compiles to a no-op without ROCm.
void
perfetto_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms)
{
#if ROCPROFSYS_USE_ROCM > 0
auto _corr_id = _sms.correlation_id_internal;
auto _stream_id = _sms.stream_handle;
auto _queue_id_handle = _sms.queue_id_handle;
// NOTE(review): _t_info is dereferenced without a null check; assumes
// thread_info::get() always yields a valid entry for this thread id -- confirm
const auto& _t_info = thread_info::get(_sms.thread_id, SystemTID);
const auto _thread_id_sequent = _t_info->index_data->sequent_value;
auto _beg_ts = _sms.start_timestamp;
auto _end_ts = _sms.end_timestamp;
auto _agent_device_id =
m_agent_manager.get_agent_by_handle(_sms.agent_id_handle).device_type_index;
// Human-readable operation name looked up from the buffer tracing metadata
auto _name = std::string{ m_metadata.get_buffer_name_info().at(
static_cast<rocprofiler_buffer_tracing_kind_t>(_sms.kind),
static_cast<rocprofiler_tracing_operation_t>(_sms.operation)) };
// Scratch memory samples from SDK versions prior to 7.0.2 do not include
// allocation_size field, so counter tracks are not needed
# if ROCPROFSYS_ROCM_VERSION >= 70002
using counter_track =
perfetto_counter_track<rocprofiler_buffer_tracing_scratch_memory_record_t>;
// NOTE(review): the track is created once per agent device id, but its
// description embeds the thread of whichever sample arrives first -- confirm
// that this naming is intended
if(!counter_track::exists(_agent_device_id))
{
auto _track_desc_alloc_size = JOIN("", "GPU Scratch Memory [", _agent_device_id,
"] Thread ", _thread_id_sequent);
counter_track::emplace(_agent_device_id, _track_desc_alloc_size, "bytes");
}
// Only allocations produce a counter sample; other operations do not
// update the counter here
if(_sms.operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC)
{
TRACE_COUNTER("rocm_scratch_memory", counter_track::at(_agent_device_id, 0),
_beg_ts, _sms.allocation_size);
}
# endif
// Track description is produced lazily through this callable
auto _track_desc_events = [&]() {
return JOIN("", "GPU Scratch Memory Events Thread ", _thread_id_sequent);
};
const auto _track =
tracing::get_perfetto_track(category::rocm_scratch_memory{}, _track_desc_events);
// Attaches debug annotations to the slice; skipped when annotations are off.
// NOTE(review): allocation_size is read here regardless of the ROCm version
// check above -- confirm the sample always carries a meaningful value
auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) {
if(!m_use_annotations) return;
annotate_perfetto(ctx, { { "begin_ns", _beg_ts },
{ "end_ns", _end_ts },
{ "corr_id", _corr_id },
{ "stream_id", _stream_id },
{ "queue", _queue_id_handle },
{ "allocation_size", _sms.allocation_size },
{ "agent_id", _agent_device_id },
{ "operation", _name },
{ "flags", _sms.flags } });
};
// Slice begins at the start timestamp and is closed at the end timestamp
tracing::push_perfetto(category::rocm_scratch_memory{}, _name.c_str(), _track,
_beg_ts, ::perfetto::Flow::ProcessScoped(_corr_id),
add_perfetto_annotations);
tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track, _end_ts);
#endif
}
void void
perfetto_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs) perfetto_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs)
{ {
@@ -56,6 +56,7 @@ public:
void finalize_processing(); void finalize_processing();
void handle(const kernel_dispatch_sample& sample); void handle(const kernel_dispatch_sample& sample);
void handle(const scratch_memory_sample& sample);
void handle(const memory_copy_sample& sample); void handle(const memory_copy_sample& sample);
void handle(const memory_allocate_sample& sample); void handle(const memory_allocate_sample& sample);
void handle(const region_sample& sample); void handle(const region_sample& sample);
@@ -66,6 +66,37 @@ get_handle_from_code_object(
# endif # endif
} }
#endif #endif
#if ROCPROFSYS_USE_ROCM > 0
using memory_operation = std::string;
using memory_type = std::string;
std::pair<memory_operation, memory_type>
parse_memory_operation_name(std::string_view memory_operation_name)
{
static const std::unordered_map<std::string_view,
std::pair<memory_operation, memory_type>>
parsing_map{
{ "MEMORY_ALLOCATION_NONE", { "NONE", "REAL" } },
{ "MEMORY_ALLOCATION_ALLOCATE", { "ALLOC", "REAL" } },
{ "MEMORY_ALLOCATION_VMEM_ALLOCATE", { "ALLOC", "VIRTUAL" } },
{ "MEMORY_ALLOCATION_FREE", { "FREE", "REAL" } },
{ "MEMORY_ALLOCATION_VMEM_FREE", { "FREE", "VIRTUAL" } },
{ "SCRATCH_MEMORY_NONE", { "NONE", "SCRATCH" } },
{ "SCRATCH_MEMORY_ALLOC", { "ALLOC", "SCRATCH" } },
{ "SCRATCH_MEMORY_FREE", { "FREE", "SCRATCH" } },
{ "SCRATCH_MEMORY_ASYNC_RECLAIM", { "ASYNC_RECLAIM", "SCRATCH" } },
};
auto item = parsing_map.find(memory_operation_name);
if(item == parsing_map.end())
{
LOG_WARNING("Unknown memory operation name: {}", memory_operation_name);
return { "UNKNOWN", "UNKNOWN" };
}
return item->second;
}
#endif
} // namespace } // namespace
void void
@@ -110,6 +141,46 @@ rocpd_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds)
#endif #endif
} }
// Store one scratch-memory sample in the rocpd database: an event row is
// inserted first, then a memory-alloc row that refers to it.
// The body compiles to nothing unless ROCPROFSYS_USE_ROCM > 0.
void
rocpd_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms)
{
#if ROCPROFSYS_USE_ROCM > 0
    auto& node = node_info::get_instance();
    auto  proc = m_metadata->get_process_info();

    // Name registered for this (kind, operation) pair; parsed further below.
    const auto* operation_name = m_metadata->get_buffer_name_info().at(
        static_cast<rocprofiler_buffer_tracing_kind_t>(_sms.kind),
        static_cast<rocprofiler_tracing_operation_t>(_sms.operation));

    auto agent_pk = m_agent_manager->get_agent_by_handle(_sms.agent_id_handle).base_id;
    auto thread_pk = m_data_processor->map_thread_id_to_primary_key(_sms.thread_id);
    auto category_pk = m_data_processor->insert_string(
        trait::name<category::rocm_scratch_memory>::value);

    auto stack    = _sms.correlation_id_internal;
    auto ancestor = _sms.correlation_id_ancestor;

    // This handler always writes 0 for the event correlation id and for the
    // address column of the memory-alloc row.
    auto zero_correlation_id = 0;
    auto zero_address        = 0;

    auto event_pk = m_data_processor->insert_event(category_pk, stack, ancestor,
                                                   zero_correlation_id);

    auto [operation_label, type_label] = parse_memory_operation_name(operation_name);

    // The sample flags are passed along as a small JSON object.
    auto extdata = JOIN("", "{\"flags\": ", _sms.flags, "}");

    m_data_processor->insert_memory_alloc(
        node.id, proc.pid, thread_pk, agent_pk, operation_label.c_str(),
        type_label.c_str(), _sms.start_timestamp, _sms.end_timestamp, zero_address,
        _sms.allocation_size, _sms.queue_id_handle, _sms.stream_handle, event_pk,
        extdata.c_str());
#endif
}
void void
rocpd_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs) rocpd_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs)
{ {
@@ -153,46 +224,6 @@ void
rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas) rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
{ {
#if ROCPROFSYS_USE_ROCM > 0 && (ROCPROFILER_VERSION >= 600) #if ROCPROFSYS_USE_ROCM > 0 && (ROCPROFILER_VERSION >= 600)
static auto memtype_to_db =
[](std::string_view memory_type) -> std::pair<std::string, std::string> {
constexpr auto MEMORY_PREFIX = std::string_view{ "MEMORY_ALLOCATION_" };
constexpr auto SCRATCH_PREFIX = std::string_view{ "SCRATCH_MEMORY_" };
constexpr auto VMEM_PREFIX = std::string_view{ "VMEM_" };
constexpr auto ASYNC_PREFIX = std::string_view{ "ASYNC_" };
std::string _type;
std::string _level;
if(memory_type.find(MEMORY_PREFIX) == 0)
{
_type = memory_type.substr(MEMORY_PREFIX.length());
if(_type.find(VMEM_PREFIX) == 0)
{
_type = _type.substr(VMEM_PREFIX.length());
_level = "VIRTUAL";
}
else
{
_level = "REAL";
}
}
else if(memory_type.find(SCRATCH_PREFIX) == 0)
{
_type = memory_type.substr(SCRATCH_PREFIX.length());
_level = "SCRATCH";
if(memory_type.find(ASYNC_PREFIX) == 0)
{
_type = memory_type.substr(ASYNC_PREFIX.length()); // RECLAIM
}
}
if(_type == "ALLOCATE")
{
_type = "ALLOC";
}
return std::make_pair(_type, _level);
};
auto& n_info = node_info::get_instance(); auto& n_info = node_info::get_instance();
auto process = m_metadata->get_process_info(); auto process = m_metadata->get_process_info();
auto thread_primary_key = auto thread_primary_key =
@@ -210,7 +241,7 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
static_cast<rocprofiler_buffer_tracing_kind_t>(_mas.kind), static_cast<rocprofiler_buffer_tracing_kind_t>(_mas.kind),
static_cast<rocprofiler_tracing_operation_t>(_mas.operation)); static_cast<rocprofiler_tracing_operation_t>(_mas.operation));
auto [type, level] = memtype_to_db(_name); auto [memory_operation, memory_type] = parse_memory_operation_name(_name);
auto stack_id = _mas.correlation_id_internal; auto stack_id = _mas.correlation_id_internal;
auto parent_stack_id = _mas.correlation_id_ancestor; auto parent_stack_id = _mas.correlation_id_ancestor;
@@ -224,9 +255,10 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
category_primary_key, stack_id, parent_stack_id, correlation_id); category_primary_key, stack_id, parent_stack_id, correlation_id);
m_data_processor->insert_memory_alloc( m_data_processor->insert_memory_alloc(
n_info.id, process.pid, thread_primary_key, agent_primary_key, type.c_str(), n_info.id, process.pid, thread_primary_key, agent_primary_key,
level.c_str(), _mas.start_timestamp, _mas.end_timestamp, _mas.address_value, memory_operation.c_str(), memory_type.c_str(), _mas.start_timestamp,
_mas.allocation_size, queue_id, _mas.stream_handle, event_primary_key); _mas.end_timestamp, _mas.address_value, _mas.allocation_size, queue_id,
_mas.stream_handle, event_primary_key);
} }
#endif #endif
} }
@@ -45,6 +45,7 @@ public:
void finalize_processing(); void finalize_processing();
void handle(const kernel_dispatch_sample& sample); void handle(const kernel_dispatch_sample& sample);
void handle(const scratch_memory_sample& sample);
void handle(const memory_copy_sample& sample); void handle(const memory_copy_sample& sample);
void handle(const memory_allocate_sample& sample); void handle(const memory_allocate_sample& sample);
void handle(const region_sample& sample); void handle(const region_sample& sample);
@@ -43,6 +43,11 @@ struct processor_t
static_cast<T*>(this)->handle(sample); static_cast<T*>(this)->handle(sample);
} }
// Forwards a scratch-memory sample to the handle() overload of T by casting
// `this` to T*.
void handle(const scratch_memory_sample& sample)
{
static_cast<T*>(this)->handle(sample);
}
void handle(const memory_copy_sample& sample) void handle(const memory_copy_sample& sample)
{ {
static_cast<T*>(this)->handle(sample); static_cast<T*>(this)->handle(sample);
@@ -84,6 +89,7 @@ protected:
struct processor_view_t struct processor_view_t
{ {
using kernel_dispatch_fn_t = void (*)(void*, const kernel_dispatch_sample&) noexcept; using kernel_dispatch_fn_t = void (*)(void*, const kernel_dispatch_sample&) noexcept;
using scratch_memory_fn_t = void (*)(void*, const scratch_memory_sample&) noexcept;
using memory_copy_fn_t = void (*)(void*, const memory_copy_sample&) noexcept; using memory_copy_fn_t = void (*)(void*, const memory_copy_sample&) noexcept;
#if(ROCPROFILER_VERSION >= 600) #if(ROCPROFILER_VERSION >= 600)
using memory_allocate_fn_t = void (*)(void*, const memory_allocate_sample&) noexcept; using memory_allocate_fn_t = void (*)(void*, const memory_allocate_sample&) noexcept;
@@ -101,6 +107,7 @@ struct processor_view_t
struct vtable_t struct vtable_t
{ {
kernel_dispatch_fn_t handle_kernel_dispatch; kernel_dispatch_fn_t handle_kernel_dispatch;
scratch_memory_fn_t handle_scratch_memory;
memory_copy_fn_t handle_memory_copy; memory_copy_fn_t handle_memory_copy;
#if(ROCPROFILER_VERSION >= 600) #if(ROCPROFILER_VERSION >= 600)
memory_allocate_fn_t handle_memory_allocate; memory_allocate_fn_t handle_memory_allocate;
@@ -134,6 +141,11 @@ struct processor_view_t
m_vtable->handle_kernel_dispatch(m_object, sample); m_vtable->handle_kernel_dispatch(m_object, sample);
} }
// Dispatches a scratch-memory sample through the vtable's
// handle_scratch_memory function pointer, passing the stored m_object.
ROCPROFSYS_INLINE void handle(const scratch_memory_sample& sample) const noexcept
{
m_vtable->handle_scratch_memory(m_object, sample);
}
ROCPROFSYS_INLINE void handle(const memory_copy_sample& sample) const noexcept ROCPROFSYS_INLINE void handle(const memory_copy_sample& sample) const noexcept
{ {
m_vtable->handle_memory_copy(m_object, sample); m_vtable->handle_memory_copy(m_object, sample);
@@ -194,6 +206,9 @@ private:
+[](void* obj, const kernel_dispatch_sample& sample) noexcept { +[](void* obj, const kernel_dispatch_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample); static_cast<T*>(obj)->handle(sample);
}, },
+[](void* obj, const scratch_memory_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
+[](void* obj, const memory_copy_sample& sample) noexcept { +[](void* obj, const memory_copy_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample); static_cast<T*>(obj)->handle(sample);
}, },
@@ -275,6 +290,9 @@ struct sample_processor_t
case type_identifier_t::kernel_dispatch: case type_identifier_t::kernel_dispatch:
handle_sample(static_cast<const kernel_dispatch_sample&>(sample)); handle_sample(static_cast<const kernel_dispatch_sample&>(sample));
break; break;
case type_identifier_t::scratch_memory:
handle_sample(static_cast<const scratch_memory_sample&>(sample));
break;
case type_identifier_t::memory_copy: case type_identifier_t::memory_copy:
handle_sample(static_cast<const memory_copy_sample&>(sample)); handle_sample(static_cast<const memory_copy_sample&>(sample));
break; break;
@@ -46,6 +46,7 @@ enum class type_identifier_t : uint32_t
amd_smi_sample = 0x0006, amd_smi_sample = 0x0006,
cpu_freq_sample = 0x0007, cpu_freq_sample = 0x0007,
backtrace_region_sample = 0x0008, backtrace_region_sample = 0x0008,
scratch_memory = 0x0009,
fragmented_space = 0xFFFF fragmented_space = 0xFFFF
}; };
@@ -148,6 +149,83 @@ get_size(const kernel_dispatch_sample& item)
item.grid_size_z, static_cast<uint64_t>(item.stream_handle)); item.grid_size_z, static_cast<uint64_t>(item.stream_handle));
} }
// Cached record for one scratch-memory tracing event.
//
// The serialize / deserialize / get_size specializations for this type list
// the same fields in the same order; keep them in sync with any change made
// here.
struct scratch_memory_sample : cacheable_t
{
    static constexpr type_identifier_t type_identifier =
        type_identifier_t::scratch_memory;

    // Default construction yields a fully zeroed sample (see the default member
    // initializers below) instead of indeterminate values.
    scratch_memory_sample() = default;

    // Member-wise constructor; arguments appear in the same order as the data
    // members are declared.
    scratch_memory_sample(uint64_t _start_timestamp, uint64_t _end_timestamp,
                          uint64_t _thread_id, uint64_t _agent_id_handle,
                          uint64_t _queue_id_handle, int32_t _kind, int32_t _operation,
                          int32_t _flags, uint64_t _allocation_size,
                          uint64_t _correlation_id_internal,
                          uint64_t _correlation_id_ancestor, size_t _stream_handle)
    : start_timestamp(_start_timestamp)
    , end_timestamp(_end_timestamp)
    , thread_id(_thread_id)
    , agent_id_handle(_agent_id_handle)
    , queue_id_handle(_queue_id_handle)
    , kind(_kind)
    , operation(_operation)
    , flags(_flags)
    , allocation_size(_allocation_size)
    , correlation_id_internal(_correlation_id_internal)
    , correlation_id_ancestor(_correlation_id_ancestor)
    , stream_handle(_stream_handle)
    {}

    uint64_t start_timestamp         = 0;
    uint64_t end_timestamp           = 0;
    uint64_t thread_id               = 0;
    uint64_t agent_id_handle         = 0;
    uint64_t queue_id_handle         = 0;
    int32_t  kind                    = 0;  // tracing kind, stored as int32_t
    int32_t  operation               = 0;  // tracing operation, stored as int32_t
    int32_t  flags                   = 0;
    uint64_t allocation_size         = 0;
    uint64_t correlation_id_internal = 0;
    uint64_t correlation_id_ancestor = 0;
    size_t   stream_handle           = 0;  // (de)serialized as uint64_t
};
// Writes every field of a scratch_memory_sample into `buffer` with a single
// utility::store_value call. The field order here is the same one used by the
// matching deserialize and get_size specializations.
template <>
inline void
serialize(uint8_t* buffer, const scratch_memory_sample& item)
{
utility::store_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
item.agent_id_handle, item.queue_id_handle, item.kind,
item.operation, item.flags, item.allocation_size,
item.correlation_id_internal, item.correlation_id_ancestor,
// stream_handle is a size_t member; it is stored as uint64_t
static_cast<uint64_t>(item.stream_handle));
}
// Rebuilds a scratch_memory_sample from `buffer` with a single
// utility::parse_value call, listing the fields in the same order as the
// matching serialize specialization.
// NOTE(review): `buffer` is taken by reference, presumably so parse_value can
// advance it past the consumed bytes -- confirm against utility::parse_value.
template <>
inline scratch_memory_sample
deserialize(uint8_t*& buffer)
{
scratch_memory_sample item;
// stream_handle is serialized as uint64_t, so it is parsed into a uint64_t
// temporary and then assigned to the size_t member.
uint64_t stream_handle;
utility::parse_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
item.agent_id_handle, item.queue_id_handle, item.kind,
item.operation, item.flags, item.allocation_size,
item.correlation_id_internal, item.correlation_id_ancestor,
stream_handle);
item.stream_handle = stream_handle;
return item;
}
// Returns utility::get_size evaluated over the same field list (same order,
// same uint64_t cast of stream_handle) that the matching serialize
// specialization stores.
template <>
inline size_t
get_size(const scratch_memory_sample& item)
{
return utility::get_size(item.start_timestamp, item.end_timestamp, item.thread_id,
item.agent_id_handle, item.queue_id_handle, item.kind,
item.operation, item.flags, item.allocation_size,
item.correlation_id_internal, item.correlation_id_ancestor,
static_cast<uint64_t>(item.stream_handle));
}
struct memory_copy_sample : cacheable_t struct memory_copy_sample : cacheable_t
{ {
static constexpr type_identifier_t type_identifier = type_identifier_t::memory_copy; static constexpr type_identifier_t type_identifier = type_identifier_t::memory_copy;
@@ -1272,7 +1272,10 @@ shutdown()
try try
{ {
data::shutdown(); if(data::shutdown())
{
ROCPROFSYS_AMD_SMI_CALL(amdsmi_shut_down());
}
} catch(std::runtime_error& _e) } catch(std::runtime_error& _e)
{ {
LOG_WARNING("Exception thrown when shutting down amd-smi: {}", _e.what()); LOG_WARNING("Exception thrown when shutting down amd-smi: {}", _e.what());
@@ -24,6 +24,7 @@
#include "core/common.hpp" #include "core/common.hpp"
#include "core/config.hpp" #include "core/config.hpp"
#include "core/state.hpp" #include "core/state.hpp"
#include "core/timemory.hpp"
#include "library/runtime.hpp" #include "library/runtime.hpp"
#include <timemory/backends/threading.hpp> #include <timemory/backends/threading.hpp>
@@ -33,9 +34,7 @@
#include "logger/debug.hpp" #include "logger/debug.hpp"
#include <cstddef> #include <cstddef>
#include <cstdio>
#include <cstdlib> #include <cstdlib>
#include <unistd.h>
namespace rocprofsys namespace rocprofsys
{ {
@@ -89,26 +88,6 @@ void
exit_gotcha::operator()(const gotcha_data& _data, exit_func_t _func, int _ec) const exit_gotcha::operator()(const gotcha_data& _data, exit_func_t _func, int _ec) const
{ {
_exit_info = { true, _data.tool_id.find("quick") != std::string::npos, _ec }; _exit_info = { true, _data.tool_id.find("quick") != std::string::npos, _ec };
if(config::get_use_amd_smi())
{
threading::clear_callbacks();
if(get_state() < ::rocprofsys::State::Finalized && !is_child_process())
{
LOG_DEBUG("Finalizing {} before calling {}({})...", get_exe_name(),
_data.tool_id, _ec);
rocprofsys_finalize();
}
LOG_DEBUG("Calling _exit({}) in {} to avoid AMD SMI cleanup issues...", _ec,
get_exe_name().c_str());
std::fflush(nullptr);
_exit(_ec);
}
invoke_exit_gotcha(_data, _func, _ec); invoke_exit_gotcha(_data, _func, _ec);
} }
@@ -567,6 +567,18 @@ get_mem_alloc_address(
} }
#endif #endif
// Returns the allocation size carried by a scratch-memory record, or 0 when
// the build cannot provide one.
uint64_t
get_scratch_mem_alloc_size(
    [[maybe_unused]] const rocprofiler_buffer_tracing_scratch_memory_record_t& record)
{
#if !(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002)
    // Scratch memory samples from SDK versions prior to 7.0.2 do not include
    // allocation_size, so there is nothing to read from the record.
    return 0;
#else
    return record.allocation_size;
#endif
}
void void
cache_region(const rocprofiler_callback_tracing_record_t* record, cache_region(const rocprofiler_callback_tracing_record_t* record,
const rocprofiler_timestamp_t start_timestamp, const rocprofiler_timestamp_t start_timestamp,
@@ -615,13 +627,26 @@ cache_kernel_dispatch(rocprofiler_buffer_tracing_kernel_dispatch_record_t* recor
record->dispatch_info.grid_size.z, stream_handle }); record->dispatch_info.grid_size.z, stream_handle });
} }
// Registers `stream_handle` with the metadata registry, then stores a
// scratch_memory_sample built from `record` in the trace-cache buffer storage.
void
cache_scratch_memory(rocprofiler_buffer_tracing_scratch_memory_record_t* record,
                     uint64_t                                            stream_handle)
{
    trace_cache::get_metadata_registry().add_stream(stream_handle);

    // The sample keeps kind / operation / flags as plain 32-bit integers.
    const auto kind_value      = static_cast<int32_t>(record->kind);
    const auto operation_value = static_cast<int32_t>(record->operation);
    const auto flags_value     = static_cast<int32_t>(record->flags);

    trace_cache::get_buffer_storage().store(trace_cache::scratch_memory_sample{
        record->start_timestamp, record->end_timestamp, record->thread_id,
        record->agent_id.handle, record->queue_id.handle, kind_value, operation_value,
        flags_value, get_scratch_mem_alloc_size(*record),
        record->correlation_id.internal, get_parent_stack_id(record->correlation_id),
        stream_handle });
}
void void
cache_memory_copy(rocprofiler_buffer_tracing_memory_copy_record_t* record, cache_memory_copy(rocprofiler_buffer_tracing_memory_copy_record_t* record,
uint64_t stream_handle) uint64_t stream_handle)
{ {
trace_cache::get_metadata_registry().add_stream(stream_handle); trace_cache::get_metadata_registry().add_stream(stream_handle);
trace_cache::get_buffer_storage().store(trace_cache::memory_copy_sample{ trace_cache::get_buffer_storage().store(trace_cache::memory_copy_sample{
record->start_timestamp, record->end_timestamp, record->thread_id, record->start_timestamp, record->end_timestamp, record->thread_id,
record->dst_agent_id.handle, record->src_agent_id.handle, record->dst_agent_id.handle, record->src_agent_id.handle,
static_cast<int32_t>(record->kind), static_cast<int32_t>(record->operation), static_cast<int32_t>(record->kind), static_cast<int32_t>(record->operation),
@@ -1759,6 +1784,120 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/,
} }
} }
} }
else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)
{
auto* record =
static_cast<rocprofiler_buffer_tracing_scratch_memory_record_t*>(
header->payload);
bool _group_by_queue = _default_group_by_queue;
const auto* agent = tool_data->get_gpu_tool_agent(record->agent_id);
auto device_id = static_cast<uint32_t>(agent->device_id);
const auto& t_info = thread_info::get(record->thread_id, SystemTID);
auto thread_id_sequent = t_info->index_data->sequent_value;
auto _corr_id = record->correlation_id.internal;
auto _beg_ns = record->start_timestamp;
auto _end_ns = record->end_timestamp;
auto _name =
tool_data->buffered_tracing_info.at(record->kind, record->operation);
auto _stream_id = get_stream_id(record).handle;
if(_stream_id == 0)
{
// Scratch memory event is not associated with a HIP stream
_group_by_queue = true;
}
{
auto track_name = JOIN("", "GPU Scratch Memory [", device_id,
"] Thread ", record->thread_id);
cache_category<category::rocm_scratch_memory>();
cache_add_thread_info(record->thread_id);
cache_add_track(track_name.c_str(), record->thread_id);
cache_scratch_memory(record, _stream_id);
}
if(get_use_timemory())
{
auto _bundle = kernel_dispatch_bundle_t{ _name };
_bundle.push(thread_id_sequent).start().stop();
_bundle.get([_beg_ns, _end_ns](tim::component::wall_clock* _wc) {
_wc->set_value(_end_ns - _beg_ns);
_wc->set_accum(_end_ns - _beg_ns);
});
_bundle.pop();
}
if(get_use_perfetto())
{
// Scratch memory samples from SDK versions prior to 7.0.2 do not include
// allocation_size field, so counter tracks are not needed
#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002)
using counter_track = perfetto_counter_track<
rocprofiler_buffer_tracing_scratch_memory_record_t>;
if(!counter_track::exists(device_id))
{
auto track_name_alloc_size =
JOIN("", "GPU Scratch Memory [", device_id, "] (S) Thread ",
thread_id_sequent);
counter_track::emplace(device_id, track_name_alloc_size, "bytes");
}
if(record->operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC)
{
TRACE_COUNTER("rocm_scratch_memory",
counter_track::at(device_id, 0), _beg_ns,
record->allocation_size);
}
#endif
auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) {
if(config::get_perfetto_annotations())
{
tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns);
tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns);
tracing::add_perfetto_annotation(ctx, "corr_id", _corr_id);
tracing::add_perfetto_annotation(ctx, "stream_id",
_stream_id);
}
};
if(_group_by_queue)
{
auto track_name_events = [&]() {
return JOIN("", "GPU Scratch Memory (S) Events Thread ",
thread_id_sequent);
};
const auto _track = tracing::get_perfetto_track(
category::rocm_scratch_memory{}, track_name_events);
tracing::push_perfetto(category::rocm_scratch_memory{},
_name.data(), _track, _beg_ns,
::perfetto::Flow::ProcessScoped(_corr_id),
add_perfetto_annotations);
tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track,
_end_ns);
}
else
{
const auto _track = tracing::get_perfetto_track(
category::rocm_hip_stream{}, _track_desc_stream, _stream_id);
tracing::push_perfetto(category::rocm_hip_stream{}, _name.data(),
_track, _beg_ns,
::perfetto::Flow::ProcessScoped(_corr_id),
add_perfetto_annotations);
tracing::pop_perfetto(category::rocm_hip_stream{}, "", _track,
_end_ns);
}
}
}
else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)
{ {
auto* record = auto* record =
@@ -2249,6 +2388,17 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data)
_data->primary_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0, _data->primary_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0,
_data->memory_copy_buffer)); _data->memory_copy_buffer));
} }
if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) > 0)
{
ROCPROFILER_CALL(rocprofiler_create_buffer(
_data->primary_ctx, buffer_size, watermark,
ROCPROFILER_BUFFER_POLICY_LOSSLESS, tool_tracing_buffered, tool_data,
&_data->scratch_memory_buffer));
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
_data->primary_ctx, ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, nullptr, 0,
_data->scratch_memory_buffer));
}
#if(ROCPROFILER_VERSION >= 600) #if(ROCPROFILER_VERSION >= 600)
if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION) > 0) if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION) > 0)
@@ -122,7 +122,7 @@ using backtrace_operation_map_t =
struct client_data struct client_data
{ {
static constexpr size_t num_buffers = 4; static constexpr size_t num_buffers = 5;
static constexpr size_t num_contexts = 2; static constexpr size_t num_contexts = 2;
using buffer_name_info_t = rocprofiler::sdk::buffer_name_info_t<std::string_view>; using buffer_name_info_t = rocprofiler::sdk::buffer_name_info_t<std::string_view>;
@@ -138,6 +138,7 @@ struct client_data
rocprofiler_context_id_t primary_ctx = { 0 }; rocprofiler_context_id_t primary_ctx = { 0 };
rocprofiler_context_id_t counter_ctx = { 0 }; rocprofiler_context_id_t counter_ctx = { 0 };
rocprofiler_buffer_id_t kernel_dispatch_buffer = { 0 }; rocprofiler_buffer_id_t kernel_dispatch_buffer = { 0 };
rocprofiler_buffer_id_t scratch_memory_buffer = { 0 };
rocprofiler_buffer_id_t memory_copy_buffer = { 0 }; rocprofiler_buffer_id_t memory_copy_buffer = { 0 };
rocprofiler_buffer_id_t memory_alloc_buffer = { 0 }; rocprofiler_buffer_id_t memory_alloc_buffer = { 0 };
rocprofiler_buffer_id_t counter_collection_buffer = { 0 }; rocprofiler_buffer_id_t counter_collection_buffer = { 0 };
@@ -179,12 +180,9 @@ client_data::get_contexts() const
inline client_data::buffer_id_vec_t inline client_data::buffer_id_vec_t
client_data::get_buffers() const client_data::get_buffers() const
{ {
return buffer_id_vec_t{ return buffer_id_vec_t{ kernel_dispatch_buffer, scratch_memory_buffer,
kernel_dispatch_buffer, memory_copy_buffer, memory_alloc_buffer,
memory_copy_buffer, counter_collection_buffer };
memory_alloc_buffer,
counter_collection_buffer,
};
} }
inline const rocprofsys_agent_t* inline const rocprofsys_agent_t*
@@ -47,7 +47,6 @@ from .parser import parse_uploaded_file
from .parser import find_causal_files from .parser import find_causal_files
import plotly.graph_objects as go import plotly.graph_objects as go
file_timestamp = 0 file_timestamp = 0
global_data = pd.DataFrame() global_data = pd.DataFrame()
global_samples = pd.DataFrame() global_samples = pd.DataFrame()
@@ -37,7 +37,6 @@ from . import libpyrocprofsys
from .libpyrocprofsys.profiler import profiler_init as _profiler_init from .libpyrocprofsys.profiler import profiler_init as _profiler_init
from .libpyrocprofsys.profiler import profiler_finalize as _profiler_fini from .libpyrocprofsys.profiler import profiler_finalize as _profiler_fini
__all__ = ["exec_", "_file", "_get_argv", "_initialize", "_finalize"] __all__ = ["exec_", "_file", "_get_argv", "_initialize", "_finalize"]
@@ -44,7 +44,6 @@ from .libpyrocprofsys.user import pop_region
from .common import _initialize from .common import _initialize
from .common import _file from .common import _file
__all__ = [ __all__ = [
"region", "region",
"Region", "Region",

Some files were not shown because too many files have changed in this diff Show More