Merge branch 'develop' into amd/dev/gandryey/SWDEV-558849
Esse commit está contido em:
@@ -41,7 +41,7 @@ jobs:
|
||||
working-directory: projects/rocprofiler-compute
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install ruff
|
||||
python -m pip install ruff==0.14.11
|
||||
if [ -f requirements.txt ]; then python -m pip install -r requirements.txt; fi
|
||||
- name: Run Ruff Linter and Import Sorter
|
||||
run: |
|
||||
|
||||
@@ -226,6 +226,12 @@ class AMDSMICommands():
|
||||
# Set args.* to passed in arguments
|
||||
if gpu:
|
||||
args.gpu = gpu
|
||||
|
||||
cpu_attributes = ["cpu"]
|
||||
for attr in cpu_attributes:
|
||||
if hasattr(args, 'cpu') and getattr(args, 'cpu'):
|
||||
print("N/A")
|
||||
return
|
||||
|
||||
# Handle No GPU passed
|
||||
if args.gpu == None:
|
||||
|
||||
@@ -8,6 +8,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
|
||||
|
||||
* New HIP APIs
|
||||
- `hipKernelGetParamInfo` returns the offset and size of a kernel parameter
|
||||
* Support for `barrier_arrive` and `barrier_wait` for `grid_group` and `thread_block`.
|
||||
|
||||
* New HIP supports
|
||||
- `grid_group::block_rank()` returns the rank of the block in the calling thread
|
||||
|
||||
@@ -111,9 +111,8 @@
|
||||
#if !defined(__HIPCC_RTC__)
|
||||
#include <hip/amd_detail/amd_hip_common.h>
|
||||
#include "amd_hip_vector_types.h" // float2 etc
|
||||
#include "device_library_decls.h" // ocml conversion functions
|
||||
#include "math_fwd.h" // ocml device functions
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
#include "math_fwd.h" // ocml device functions
|
||||
#include <hip/amd_detail/amd_warp_functions.h> // define warpSize
|
||||
#include <hip/amd_detail/amd_warp_sync_functions.h> // Sync functions
|
||||
#endif
|
||||
@@ -338,7 +337,11 @@ struct __attribute__((aligned(2))) __hip_bfloat16 {
|
||||
};
|
||||
/**@}*/
|
||||
|
||||
#if defined(__clang__)
|
||||
typedef __bf16 __bf16_2 __attribute__((ext_vector_type(2)));
|
||||
#else
|
||||
typedef __bf16 __bf16_2 __attribute__((vector_size(sizeof(__bf16) * 2)));
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \defgroup HIP_INTRINSIC_BFLOAT162_STRUCT
|
||||
@@ -350,6 +353,7 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
|
||||
static_assert(sizeof(__hip_bfloat16[2]) == sizeof(__bf16_2));
|
||||
|
||||
public:
|
||||
#if defined(__clang__)
|
||||
union {
|
||||
struct {
|
||||
__hip_bfloat16 x; /*! \brief raw representation of bfloat16 */
|
||||
@@ -357,7 +361,12 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
|
||||
};
|
||||
__bf16_2 __xy_bf162;
|
||||
};
|
||||
|
||||
#else
|
||||
/* GCC does not support anonymous structs with members that have non-trivial constructors (Clang
|
||||
allows this as an extension). Expose x and y directly instead. */
|
||||
__hip_bfloat16 x;
|
||||
__hip_bfloat16 y;
|
||||
#endif
|
||||
|
||||
public:
|
||||
/*! \brief create __hip_bfloat162 from __hip_bfloat162_raw */
|
||||
@@ -373,7 +382,11 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
|
||||
: x(a), y(b) {}
|
||||
|
||||
/*! \brief create __hip_bfloat162 from vector of __bf16_2 */
|
||||
#if defined(__clang__)
|
||||
__BF16_HOST_DEVICE__ __hip_bfloat162(const __bf16_2 in) : __xy_bf162(in) {}
|
||||
#else
|
||||
__BF16_HOST_DEVICE__ __hip_bfloat162(const __bf16_2 in) : x{in[0]}, y{in[1]} {}
|
||||
#endif
|
||||
|
||||
/*! \brief default constructor of __hip_bfloat162 */
|
||||
__BF16_HOST_DEVICE__ __hip_bfloat162() = default;
|
||||
@@ -392,11 +405,22 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
|
||||
}
|
||||
|
||||
/*! \brief return a vector of bf16 */
|
||||
__BF16_HOST_DEVICE__ operator __bf16_2() const { return __xy_bf162; }
|
||||
__BF16_HOST_DEVICE__ operator __bf16_2() const {
|
||||
#if defined(__clang__)
|
||||
return __xy_bf162;
|
||||
#else
|
||||
return __bf16_2{x, y};
|
||||
#endif
|
||||
}
|
||||
|
||||
/*! \brief return a vector of bf16 */
|
||||
__BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __bf16_2 in) {
|
||||
#if defined(__clang__)
|
||||
__xy_bf162 = in;
|
||||
#else
|
||||
x = __hip_bfloat16{in[0]};
|
||||
y = __hip_bfloat16{in[1]};
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
@@ -835,6 +859,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const
|
||||
return (__bf16)a / (__bf16)b;
|
||||
}
|
||||
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
|
||||
* \brief Performs FMA of given bfloat16 values
|
||||
@@ -844,6 +869,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip
|
||||
return __hip_bfloat16(__builtin_elementwise_fma(__bf16(a), __bf16(b), __bf16(c)));
|
||||
;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
|
||||
@@ -919,6 +945,8 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2_rn(const __hip_bfloat162 a,
|
||||
return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)};
|
||||
}
|
||||
|
||||
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
|
||||
* \brief Performs FMA of given bfloat162 values
|
||||
@@ -927,6 +955,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __
|
||||
const __hip_bfloat162 c) {
|
||||
return __hip_bfloat162{__builtin_elementwise_fma(__bf16_2(a), __bf16_2(b), __bf16_2(c))};
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
|
||||
@@ -1639,6 +1668,7 @@ __BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat162& l, const __hi
|
||||
return fl.x >= fr.x && fl.x >= fr.y;
|
||||
}
|
||||
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT16_MATH
|
||||
* \brief Calculate ceil of bfloat16
|
||||
@@ -1883,7 +1913,6 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h) {
|
||||
return __hip_bfloat162(htrunc(h.x), htrunc(h.y));
|
||||
}
|
||||
|
||||
#if defined(__clang__) && defined(__HIP__)
|
||||
/**
|
||||
* \ingroup HIP_INTRINSIC_BFLOAT162_MATH
|
||||
* \brief Atomic add bfloat162
|
||||
|
||||
@@ -212,6 +212,19 @@ class grid_group : public thread_group {
|
||||
//! @copydoc thread_group::sync
|
||||
__CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
|
||||
__CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
|
||||
struct arrival_token {
|
||||
unsigned int signal;
|
||||
};
|
||||
//! Arrive at a barrier
|
||||
__CG_QUALIFIER__ arrival_token barrier_arrive() const {
|
||||
arrival_token t;
|
||||
t.signal = internal::grid::barrier_signal();
|
||||
return t;
|
||||
}
|
||||
//! Arrive at a barrier
|
||||
__CG_QUALIFIER__ void barrier_wait(arrival_token&& t) const {
|
||||
internal::grid::barrier_wait(t.signal);
|
||||
}
|
||||
};
|
||||
|
||||
/** \ingroup CooperativeGConstruct
|
||||
@@ -295,6 +308,14 @@ class thread_block : public thread_group {
|
||||
__CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
|
||||
//! Returns the group dimensions.
|
||||
__CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
|
||||
struct arrival_token {};
|
||||
//! Arrive at a barrier
|
||||
__CG_QUALIFIER__ arrival_token barrier_arrive() const {
|
||||
internal::workgroup::barrier_arrive();
|
||||
return arrival_token{};
|
||||
}
|
||||
//! Arrive at a barrier
|
||||
__CG_QUALIFIER__ void barrier_wait(arrival_token&&) const { internal::workgroup::barrier_wait(); }
|
||||
};
|
||||
|
||||
/** \ingroup CooperativeGConstruct
|
||||
|
||||
@@ -97,6 +97,8 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
|
||||
extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
|
||||
extern "C" __device__ __attribute__((const)) uint __ockl_grid_bar_arrive(void);
|
||||
extern "C" __device__ __attribute__((convergent)) void __ockl_grid_bar_wait(uint);
|
||||
|
||||
extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
|
||||
|
||||
|
||||
@@ -63,7 +63,7 @@
|
||||
#define HIP_API_TABLE_STEP_VERSION 0
|
||||
#define HIP_COMPILER_API_TABLE_STEP_VERSION 0
|
||||
#define HIP_TOOLS_API_TABLE_STEP_VERSION 0
|
||||
#define HIP_RUNTIME_API_TABLE_STEP_VERSION 20
|
||||
#define HIP_RUNTIME_API_TABLE_STEP_VERSION 21
|
||||
|
||||
// HIP API interface
|
||||
// HIP compiler dispatch functions
|
||||
@@ -1113,6 +1113,9 @@ typedef hipError_t (*t_hipKernelGetLibrary)(hipLibrary_t* library, hipKernel_t k
|
||||
typedef hipError_t (*t_hipKernelGetName)(const char** name, hipKernel_t kernel);
|
||||
typedef hipError_t (*t_hipGetProcAddress_spt)(const char* symbol, void** pfn, int hipVersion, uint64_t flags,
|
||||
hipDriverProcAddressQueryResult* symbolStatus);
|
||||
typedef hipError_t (*t_hipExtDisableLogging)();
|
||||
typedef hipError_t (*t_hipExtEnableLogging)();
|
||||
typedef hipError_t (*t_hipExtSetLoggingParams)(size_t log_level, size_t log_size, size_t log_mask);
|
||||
|
||||
typedef hipError_t (*t_hipKernelGetParamInfo)(hipKernel_t kernel, size_t paramIndex,
|
||||
size_t* paramOffset, size_t* paramSize);
|
||||
@@ -1707,8 +1710,13 @@ struct HipDispatchTable {
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
|
||||
t_hipKernelGetParamInfo hipKernelGetParamInfo_fn;
|
||||
|
||||
// DO NOT EDIT ABOVE!
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
|
||||
t_hipExtDisableLogging hipExtDisableLogging_fn;
|
||||
t_hipExtEnableLogging hipExtEnableLogging_fn;
|
||||
t_hipExtSetLoggingParams hipExtSetLoggingParams_fn;
|
||||
|
||||
// DO NOT EDIT ABOVE!
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 22
|
||||
|
||||
// ******************************************************************************************* //
|
||||
//
|
||||
|
||||
@@ -196,6 +196,9 @@ __CG_STATIC_QUALIFIER__ dim3 grid_dim() {
|
||||
static_cast<__hip_uint32_t>(gridDim.z)));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ unsigned int barrier_signal() { return __ockl_grid_bar_arrive(); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void barrier_wait(unsigned int s) { __ockl_grid_bar_wait(s); }
|
||||
} // namespace grid
|
||||
|
||||
/**
|
||||
@@ -238,6 +241,23 @@ __CG_STATIC_QUALIFIER__ dim3 block_dim() {
|
||||
static_cast<__hip_uint32_t>(blockDim.z)));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void barrier_arrive() {
|
||||
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
|
||||
#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \
|
||||
__has_builtin(__builtin_amdgcn_s_barrier_wait)
|
||||
__builtin_amdgcn_s_barrier_signal(-1);
|
||||
#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void barrier_wait() {
|
||||
#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \
|
||||
__has_builtin(__builtin_amdgcn_s_barrier_wait)
|
||||
__builtin_amdgcn_s_barrier_wait(-1);
|
||||
#else
|
||||
__builtin_amdgcn_s_barrier();
|
||||
#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
|
||||
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
|
||||
}
|
||||
} // namespace workgroup
|
||||
|
||||
namespace tiled_group {
|
||||
|
||||
@@ -468,7 +468,10 @@ enum hip_api_id_t {
|
||||
HIP_API_ID_hipKernelGetName = 448,
|
||||
HIP_API_ID_hipOccupancyAvailableDynamicSMemPerBlock = 449,
|
||||
HIP_API_ID_hipKernelGetParamInfo = 450,
|
||||
HIP_API_ID_LAST = 450,
|
||||
HIP_API_ID_hipExtDisableLogging = 451,
|
||||
HIP_API_ID_hipExtEnableLogging = 452,
|
||||
HIP_API_ID_hipExtSetLoggingParams = 453,
|
||||
HIP_API_ID_LAST = 453,
|
||||
|
||||
HIP_API_ID_hipChooseDevice = HIP_API_ID_CONCAT(HIP_API_ID_,hipChooseDevice),
|
||||
HIP_API_ID_hipGetDeviceProperties = HIP_API_ID_CONCAT(HIP_API_ID_,hipGetDeviceProperties),
|
||||
@@ -590,12 +593,15 @@ static inline const char* hip_api_name(const uint32_t id) {
|
||||
case HIP_API_ID_hipEventRecord: return "hipEventRecord";
|
||||
case HIP_API_ID_hipEventRecordWithFlags: return "hipEventRecordWithFlags";
|
||||
case HIP_API_ID_hipEventSynchronize: return "hipEventSynchronize";
|
||||
case HIP_API_ID_hipExtDisableLogging: return "hipExtDisableLogging";
|
||||
case HIP_API_ID_hipExtEnableLogging: return "hipExtEnableLogging";
|
||||
case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError";
|
||||
case HIP_API_ID_hipExtGetLinkTypeAndHopCount: return "hipExtGetLinkTypeAndHopCount";
|
||||
case HIP_API_ID_hipExtLaunchKernel: return "hipExtLaunchKernel";
|
||||
case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: return "hipExtLaunchMultiKernelMultiDevice";
|
||||
case HIP_API_ID_hipExtMallocWithFlags: return "hipExtMallocWithFlags";
|
||||
case HIP_API_ID_hipExtModuleLaunchKernel: return "hipExtModuleLaunchKernel";
|
||||
case HIP_API_ID_hipExtSetLoggingParams: return "hipExtSetLoggingParams";
|
||||
case HIP_API_ID_hipExtStreamCreateWithCUMask: return "hipExtStreamCreateWithCUMask";
|
||||
case HIP_API_ID_hipExtStreamGetCUMask: return "hipExtStreamGetCUMask";
|
||||
case HIP_API_ID_hipExternalMemoryGetMappedBuffer: return "hipExternalMemoryGetMappedBuffer";
|
||||
@@ -1034,12 +1040,15 @@ static inline uint32_t hipApiIdByName(const char* name) {
|
||||
if (strcmp("hipEventRecord", name) == 0) return HIP_API_ID_hipEventRecord;
|
||||
if (strcmp("hipEventRecordWithFlags", name) == 0) return HIP_API_ID_hipEventRecordWithFlags;
|
||||
if (strcmp("hipEventSynchronize", name) == 0) return HIP_API_ID_hipEventSynchronize;
|
||||
if (strcmp("hipExtDisableLogging", name) == 0) return HIP_API_ID_hipExtDisableLogging;
|
||||
if (strcmp("hipExtEnableLogging", name) == 0) return HIP_API_ID_hipExtEnableLogging;
|
||||
if (strcmp("hipExtGetLastError", name) == 0) return HIP_API_ID_hipExtGetLastError;
|
||||
if (strcmp("hipExtGetLinkTypeAndHopCount", name) == 0) return HIP_API_ID_hipExtGetLinkTypeAndHopCount;
|
||||
if (strcmp("hipExtLaunchKernel", name) == 0) return HIP_API_ID_hipExtLaunchKernel;
|
||||
if (strcmp("hipExtLaunchMultiKernelMultiDevice", name) == 0) return HIP_API_ID_hipExtLaunchMultiKernelMultiDevice;
|
||||
if (strcmp("hipExtMallocWithFlags", name) == 0) return HIP_API_ID_hipExtMallocWithFlags;
|
||||
if (strcmp("hipExtModuleLaunchKernel", name) == 0) return HIP_API_ID_hipExtModuleLaunchKernel;
|
||||
if (strcmp("hipExtSetLoggingParams", name) == 0) return HIP_API_ID_hipExtSetLoggingParams;
|
||||
if (strcmp("hipExtStreamCreateWithCUMask", name) == 0) return HIP_API_ID_hipExtStreamCreateWithCUMask;
|
||||
if (strcmp("hipExtStreamGetCUMask", name) == 0) return HIP_API_ID_hipExtStreamGetCUMask;
|
||||
if (strcmp("hipExternalMemoryGetMappedBuffer", name) == 0) return HIP_API_ID_hipExternalMemoryGetMappedBuffer;
|
||||
@@ -1851,6 +1860,11 @@ typedef struct hip_api_data_s {
|
||||
hipEvent_t stopEvent;
|
||||
unsigned int flags;
|
||||
} hipExtModuleLaunchKernel;
|
||||
struct {
|
||||
size_t log_level;
|
||||
size_t log_size;
|
||||
size_t log_mask;
|
||||
} hipExtSetLoggingParams;
|
||||
struct {
|
||||
hipStream_t* stream;
|
||||
hipStream_t stream__val;
|
||||
@@ -4484,6 +4498,12 @@ typedef struct hip_api_data_s {
|
||||
#define INIT_hipEventSynchronize_CB_ARGS_DATA(cb_data) { \
|
||||
cb_data.args.hipEventSynchronize.event = (hipEvent_t)event; \
|
||||
};
|
||||
// hipExtDisableLogging[]
|
||||
#define INIT_hipExtDisableLogging_CB_ARGS_DATA(cb_data) { \
|
||||
};
|
||||
// hipExtEnableLogging[]
|
||||
#define INIT_hipExtEnableLogging_CB_ARGS_DATA(cb_data) { \
|
||||
};
|
||||
// hipExtGetLastError[]
|
||||
#define INIT_hipExtGetLastError_CB_ARGS_DATA(cb_data) { \
|
||||
};
|
||||
@@ -4535,6 +4555,12 @@ typedef struct hip_api_data_s {
|
||||
cb_data.args.hipExtModuleLaunchKernel.stopEvent = (hipEvent_t)stopEvent; \
|
||||
cb_data.args.hipExtModuleLaunchKernel.flags = (unsigned int)flags; \
|
||||
};
|
||||
// hipExtSetLoggingParams[('size_t', 'log_level'), ('size_t', 'log_size'), ('size_t', 'log_mask')]
|
||||
#define INIT_hipExtSetLoggingParams_CB_ARGS_DATA(cb_data) { \
|
||||
cb_data.args.hipExtSetLoggingParams.log_level = (size_t)log_level; \
|
||||
cb_data.args.hipExtSetLoggingParams.log_size = (size_t)log_size; \
|
||||
cb_data.args.hipExtSetLoggingParams.log_mask = (size_t)log_mask; \
|
||||
};
|
||||
// hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')]
|
||||
#define INIT_hipExtStreamCreateWithCUMask_CB_ARGS_DATA(cb_data) { \
|
||||
cb_data.args.hipExtStreamCreateWithCUMask.stream = (hipStream_t*)stream; \
|
||||
@@ -7125,6 +7151,12 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
|
||||
// hipEventSynchronize[('hipEvent_t', 'event')]
|
||||
case HIP_API_ID_hipEventSynchronize:
|
||||
break;
|
||||
// hipExtDisableLogging[]
|
||||
case HIP_API_ID_hipExtDisableLogging:
|
||||
break;
|
||||
// hipExtEnableLogging[]
|
||||
case HIP_API_ID_hipExtEnableLogging:
|
||||
break;
|
||||
// hipExtGetLastError[]
|
||||
case HIP_API_ID_hipExtGetLastError:
|
||||
break;
|
||||
@@ -7150,6 +7182,9 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
|
||||
if (data->args.hipExtModuleLaunchKernel.kernelParams) data->args.hipExtModuleLaunchKernel.kernelParams__val = *(data->args.hipExtModuleLaunchKernel.kernelParams);
|
||||
if (data->args.hipExtModuleLaunchKernel.extra) data->args.hipExtModuleLaunchKernel.extra__val = *(data->args.hipExtModuleLaunchKernel.extra);
|
||||
break;
|
||||
// hipExtSetLoggingParams[('size_t', 'log_level'), ('size_t', 'log_size'), ('size_t', 'log_mask')]
|
||||
case HIP_API_ID_hipExtSetLoggingParams:
|
||||
break;
|
||||
// hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')]
|
||||
case HIP_API_ID_hipExtStreamCreateWithCUMask:
|
||||
if (data->args.hipExtStreamCreateWithCUMask.stream) data->args.hipExtStreamCreateWithCUMask.stream__val = *(data->args.hipExtStreamCreateWithCUMask.stream);
|
||||
@@ -9124,6 +9159,14 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da
|
||||
oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventSynchronize.event);
|
||||
oss << ")";
|
||||
break;
|
||||
case HIP_API_ID_hipExtDisableLogging:
|
||||
oss << "hipExtDisableLogging(";
|
||||
oss << ")";
|
||||
break;
|
||||
case HIP_API_ID_hipExtEnableLogging:
|
||||
oss << "hipExtEnableLogging(";
|
||||
oss << ")";
|
||||
break;
|
||||
case HIP_API_ID_hipExtGetLastError:
|
||||
oss << "hipExtGetLastError(";
|
||||
oss << ")";
|
||||
@@ -9188,6 +9231,13 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da
|
||||
oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.flags);
|
||||
oss << ")";
|
||||
break;
|
||||
case HIP_API_ID_hipExtSetLoggingParams:
|
||||
oss << "hipExtSetLoggingParams(";
|
||||
oss << "log_level="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_level);
|
||||
oss << ", log_size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_size);
|
||||
oss << ", log_mask="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_mask);
|
||||
oss << ")";
|
||||
break;
|
||||
case HIP_API_ID_hipExtStreamCreateWithCUMask:
|
||||
oss << "hipExtStreamCreateWithCUMask(";
|
||||
if (data->args.hipExtStreamCreateWithCUMask.stream == NULL) oss << "stream=NULL";
|
||||
|
||||
@@ -110,6 +110,7 @@ target_sources(amdhip64 PRIVATE
|
||||
hip_graph.cpp
|
||||
hip_hmm.cpp
|
||||
hip_intercept.cpp
|
||||
hip_log.cpp
|
||||
hip_memory.cpp
|
||||
hip_mempool.cpp
|
||||
hip_mempool_impl.cpp
|
||||
|
||||
@@ -523,3 +523,6 @@ hipKernelGetName
|
||||
hipOccupancyAvailableDynamicSMemPerBlock
|
||||
hipGetProcAddress_spt
|
||||
hipKernelGetParamInfo
|
||||
hipExtDisableLogging
|
||||
hipExtEnableLogging
|
||||
hipExtSetLoggingParams
|
||||
|
||||
@@ -885,6 +885,9 @@ hipError_t hipOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, con
|
||||
int numBlocks, int blockSize);
|
||||
hipError_t hipKernelGetParamInfo(hipKernel_t kernel, size_t paramIndex, size_t* paramOffset,
|
||||
size_t* paramSize);
|
||||
hipError_t hipExtDisableLogging();
|
||||
hipError_t hipExtEnableLogging();
|
||||
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask);
|
||||
} // namespace hip
|
||||
|
||||
namespace hip {
|
||||
@@ -1432,6 +1435,9 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) {
|
||||
ptrDispatchTable->hipKernelGetName_fn = hip::hipKernelGetName;
|
||||
ptrDispatchTable->hipOccupancyAvailableDynamicSMemPerBlock_fn = hip::hipOccupancyAvailableDynamicSMemPerBlock;
|
||||
ptrDispatchTable->hipKernelGetParamInfo_fn = hip::hipKernelGetParamInfo;
|
||||
ptrDispatchTable->hipExtDisableLogging_fn = hip::hipExtDisableLogging;
|
||||
ptrDispatchTable->hipExtEnableLogging_fn = hip::hipExtEnableLogging;
|
||||
ptrDispatchTable->hipExtSetLoggingParams_fn = hip::hipExtSetLoggingParams;
|
||||
}
|
||||
|
||||
#if HIP_ROCPROFILER_REGISTER > 0
|
||||
@@ -2114,15 +2120,19 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipOccupancyAvailableDynamicSMemPerBlock_fn, 5
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipGetProcAddress_spt_fn, 506);
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipKernelGetParamInfo_fn, 507);
|
||||
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipExtDisableLogging_fn, 508);
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipExtEnableLogging_fn, 509);
|
||||
HIP_ENFORCE_ABI(HipDispatchTable, hipExtSetLoggingParams_fn, 510);
|
||||
// if HIP_ENFORCE_ABI entries are added for each new function pointer in the table, the number below
|
||||
// will be +1 of the number in the last HIP_ENFORCE_ABI line. E.g.:
|
||||
//
|
||||
// HIP_ENFORCE_ABI(<table>, <functor>, 8)
|
||||
//
|
||||
// HIP_ENFORCE_ABI_VERSIONING(<table>, 9) <- 8 + 1 = 9
|
||||
HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 508)
|
||||
HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 511)
|
||||
|
||||
static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 20,
|
||||
static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 21,
|
||||
"If you get this error, add new HIP_ENFORCE_ABI(...) code for the new function "
|
||||
"pointers and then update this check so it is true");
|
||||
#endif
|
||||
|
||||
@@ -645,6 +645,9 @@ global:
|
||||
hipOccupancyAvailableDynamicSMemPerBlock;
|
||||
hipGetProcAddress_spt;
|
||||
hipKernelGetParamInfo;
|
||||
hipExtDisableLogging;
|
||||
hipExtEnableLogging;
|
||||
hipExtSetLoggingParams;
|
||||
local:
|
||||
*;
|
||||
} hip_7.1;
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "hip_internal.hpp"
|
||||
#include "hip_platform.hpp"
|
||||
|
||||
namespace hip {
|
||||
|
||||
hipError_t hipExtEnableLogging() {
|
||||
HIP_INIT_API(hipExtEnableLogging);
|
||||
amd::ScopedLock lock(PlatformState::instance().getLogLock());
|
||||
AMD_LOG_LEVEL = PlatformState::instance().log_level_;
|
||||
AMD_LOG_MASK = PlatformState::instance().log_mask_;
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
|
||||
hipError_t hipExtDisableLogging() {
|
||||
HIP_INIT_API(hipExtDisableLogging);
|
||||
amd::ScopedLock lock(PlatformState::instance().getLogLock());
|
||||
AMD_LOG_LEVEL = 0;
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
|
||||
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask) {
|
||||
HIP_INIT_API(hipExtSetLoggingParams, log_level, log_size, log_mask);
|
||||
amd::ScopedLock lock(PlatformState::instance().getLogLock());
|
||||
// Store logging parameters for later activation
|
||||
PlatformState::instance().log_level_ = log_level;
|
||||
PlatformState::instance().log_size_ = log_size;
|
||||
PlatformState::instance().log_mask_ = log_mask;
|
||||
HIP_RETURN(hipSuccess);
|
||||
}
|
||||
} // namespace::hip
|
||||
@@ -50,9 +50,12 @@ class PlatformState {
|
||||
// Unique FD Store Lock
|
||||
amd::Monitor ufd_lock_{true};
|
||||
|
||||
// Lock for logging operations
|
||||
amd::Monitor lg_lock_{true};
|
||||
|
||||
// Singleton object
|
||||
static PlatformState* platform_;
|
||||
PlatformState() {}
|
||||
PlatformState() : log_level_(0), log_size_(0), log_mask_(0) {}
|
||||
~PlatformState() {}
|
||||
|
||||
public:
|
||||
@@ -113,6 +116,14 @@ class PlatformState {
|
||||
|
||||
size_t UfdMapSize() const { return ufd_map_.size(); }
|
||||
|
||||
// Logging lock accessor
|
||||
amd::Monitor& getLogLock() { return lg_lock_; }
|
||||
|
||||
// Friend functions for logging access
|
||||
friend hipError_t hipExtEnableLogging();
|
||||
friend hipError_t hipExtDisableLogging();
|
||||
friend hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask);
|
||||
|
||||
inline bool RegisterLibraryFunction(const hipKernel_t f, const hipLibrary_t l) {
|
||||
amd::ScopedLock lock(lock_);
|
||||
if (library_functions_.find(f) == library_functions_.end()) {
|
||||
@@ -150,5 +161,10 @@ class PlatformState {
|
||||
|
||||
void* dynamicLibraryHandle_{nullptr};
|
||||
std::unordered_map<hipKernel_t, hipLibrary_t> library_functions_;
|
||||
|
||||
// Logging state (moved from LoggingInfo singleton)
|
||||
size_t log_level_;
|
||||
size_t log_size_;
|
||||
size_t log_mask_;
|
||||
};
|
||||
} // namespace hip
|
||||
|
||||
@@ -2067,4 +2067,13 @@ hipError_t hipKernelGetParamInfo(hipKernel_t kernel, size_t paramIndex, size_t*
|
||||
size_t* paramSize) {
|
||||
return hip::GetHipDispatchTable()->hipKernelGetParamInfo_fn(kernel, paramIndex, paramOffset,
|
||||
paramSize);
|
||||
}
|
||||
hipError_t hipExtEnableLogging() {
|
||||
return hip::GetHipDispatchTable()->hipExtEnableLogging_fn();
|
||||
}
|
||||
hipError_t hipExtDisableLogging() {
|
||||
return hip::GetHipDispatchTable()->hipExtDisableLogging_fn();
|
||||
}
|
||||
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask) {
|
||||
return hip::GetHipDispatchTable()->hipExtSetLoggingParams_fn(log_level, log_size, log_mask);
|
||||
}
|
||||
@@ -138,14 +138,6 @@
|
||||
"=== TODO ===",
|
||||
"Unit_Device_tgammaf_Accuracy_Limited_Positive",
|
||||
"=== TODO === fail on 100% test data",
|
||||
"Unit_Device_hexp10_Accuracy_Positive",
|
||||
"Unit_Device_h2exp10_Accuracy_Positive",
|
||||
"Unit_Device_hexp2_Accuracy_Positive",
|
||||
"Unit_Device_h2exp2_Accuracy_Positive",
|
||||
"Unit_Device_hlog_Accuracy_Positive",
|
||||
"Unit_Device_h2log_Accuracy_Positive",
|
||||
"Unit_Device_hlog10_Accuracy_Positive",
|
||||
"Unit_Device_h2log10_Accuracy_Positive",
|
||||
"Unit_Device___hfma2_Accuracy_Positive",
|
||||
#endif
|
||||
#if defined gfx90a || defined gfx942 || defined gfx950
|
||||
|
||||
@@ -752,6 +752,8 @@
|
||||
#endif
|
||||
"=== Following tests disabled as it should be a local perf test",
|
||||
"Performance_hipExtLaunchKernelGGL_QueryGPUFrequency",
|
||||
"Unit_hipDynamicLogging_Positive_Basic",
|
||||
"Unit_hipDynamicLogging_Positive_MultipleEnableDisable",
|
||||
"End of json"
|
||||
]
|
||||
}
|
||||
|
||||
@@ -20,6 +20,7 @@ set(TEST_SRC
|
||||
binary_partition.cc
|
||||
cg_ballot.cc
|
||||
cg_any_all.cc
|
||||
split_barrier.cc
|
||||
)
|
||||
|
||||
if(HIP_PLATFORM STREQUAL "nvidia")
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#include <hip_test_common.hh>
|
||||
|
||||
static __global__ void wg_split_barrier(float *out, float *in) {
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
__shared__ float mid[32];
|
||||
size_t i = threadIdx.x;
|
||||
auto tb = cg::this_thread_block();
|
||||
|
||||
out[i] = in[i] * 2.0f;
|
||||
|
||||
auto tok = tb.barrier_arrive();
|
||||
|
||||
// use tid 0 to populate shared mem
|
||||
if (i == 0) {
|
||||
for (size_t j = 0; j < 32; j++) {
|
||||
mid[j] = in[j];
|
||||
}
|
||||
}
|
||||
|
||||
tb.barrier_wait(std::move(tok));
|
||||
|
||||
out[i] += mid[i];
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_coop_thread_block_split_barrier") {
|
||||
constexpr size_t size = 32;
|
||||
float *d_out, *d_in;
|
||||
|
||||
HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size));
|
||||
HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
|
||||
|
||||
std::vector<float> in(size, 0.0f), out = in;
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
in[i] = i + 1;
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemset(d_out, 0, sizeof(float) * size));
|
||||
HIP_CHECK(
|
||||
hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice));
|
||||
wg_split_barrier<<<1, size>>>(d_out, d_in);
|
||||
HIP_CHECK(hipMemcpy(out.data(), d_out, sizeof(float) * size,
|
||||
hipMemcpyDeviceToHost));
|
||||
|
||||
HIP_CHECK(hipFree(d_out));
|
||||
HIP_CHECK(hipFree(d_in));
|
||||
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
INFO("Index: " << i << " in: " << in[i] << " out: " << out[i]);
|
||||
REQUIRE((in[i] * 3.0f) == Catch::Approx(out[i]));
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void grid_split_barrier(int *data, int *result, int N) {
|
||||
namespace cg = cooperative_groups;
|
||||
cg::grid_group grid = cg::this_grid();
|
||||
|
||||
int gid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
auto tok = grid.barrier_arrive();
|
||||
if (gid < N) {
|
||||
data[gid] = gid + 1;
|
||||
}
|
||||
|
||||
grid.barrier_wait(std::move(tok));
|
||||
|
||||
if (grid.thread_rank() == 0) {
|
||||
int sum = 0;
|
||||
for (int i = 0; i < N; i++)
|
||||
sum += data[i];
|
||||
*result = sum;
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Unit_coop_grids_split_barrier") {
|
||||
hipDeviceProp_t prop;
|
||||
HIP_CHECK(hipGetDeviceProperties(&prop, 0));
|
||||
|
||||
if (prop.cooperativeLaunch != 0) {
|
||||
int N = 1024;
|
||||
const int threads = 128;
|
||||
const int blocks = (N + threads - 1) / threads;
|
||||
|
||||
int *d_in, *d_out;
|
||||
HIP_CHECK(hipMalloc(&d_in, N * sizeof(int)));
|
||||
HIP_CHECK(hipMalloc(&d_out, sizeof(int)));
|
||||
|
||||
void *args[] = {&d_in, &d_out, &N};
|
||||
|
||||
dim3 grid(blocks);
|
||||
dim3 block(threads);
|
||||
|
||||
HIP_CHECK(hipLaunchCooperativeKernel((void *)grid_split_barrier, grid,
|
||||
block, args, 0, 0));
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
int out = 0;
|
||||
HIP_CHECK(hipMemcpy(&out, d_out, sizeof(int), hipMemcpyDeviceToHost));
|
||||
|
||||
HIP_CHECK(hipFree(d_in));
|
||||
HIP_CHECK(hipFree(d_out));
|
||||
REQUIRE(out == ((N * (N + 1)) / 2));
|
||||
}
|
||||
}
|
||||
@@ -7,6 +7,7 @@ set(TEST_SRC
|
||||
hipDrvGetErrorString.cc
|
||||
hipGetLastError.cc
|
||||
hipPeekAtLastError.cc
|
||||
hipDynamicLogging.cc
|
||||
)
|
||||
|
||||
if(UNIX)
|
||||
|
||||
@@ -0,0 +1,134 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <streambuf>
|
||||
#include <string>
|
||||
#include <fstream>
|
||||
#include <fcntl.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#define dup _dup
|
||||
#define dup2 _dup2
|
||||
#define fd_close _close
|
||||
#define unlink _unlink
|
||||
#define STDERR_FD _fileno(stderr)
|
||||
#define OPEN_FLAGS (_O_WRONLY | _O_CREAT | _O_TRUNC)
|
||||
#define OPEN_MODE (_S_IREAD | _S_IWRITE)
|
||||
#define open _open
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#define fd_close close
|
||||
#define STDERR_FD STDERR_FILENO
|
||||
#define OPEN_FLAGS (O_WRONLY | O_CREAT | O_TRUNC)
|
||||
#define OPEN_MODE 0644
|
||||
#endif
|
||||
|
||||
// Class to capture all stderr output (HIP logging uses stderr)
|
||||
class OutCapture {
|
||||
private:
|
||||
std::stringstream captured_stream_;
|
||||
std::streambuf* cerr_backup_;
|
||||
int stderr_backup_;
|
||||
std::string temp_file_;
|
||||
|
||||
static std::string getTempFilePath() {
|
||||
#ifdef _WIN32
|
||||
char temp_path[MAX_PATH];
|
||||
if (GetTempPathA(MAX_PATH, temp_path)) {
|
||||
return std::string(temp_path) + "hip_stderr_capture.txt";
|
||||
}
|
||||
// Fallback to current directory
|
||||
return "hip_stderr_capture.txt";
|
||||
#else
|
||||
return "/tmp/hip_stderr_capture.txt";
|
||||
#endif
|
||||
}
|
||||
|
||||
public:
|
||||
OutCapture() : temp_file_(getTempFilePath()) {
|
||||
// Backup original cerr stream buffer (HIP logging uses stderr)
|
||||
cerr_backup_ = std::cerr.rdbuf();
|
||||
|
||||
// Backup original stderr file descriptor
|
||||
stderr_backup_ = dup(STDERR_FD);
|
||||
}
|
||||
|
||||
void startCapture() {
|
||||
// Clear any previous content
|
||||
captured_stream_.str("");
|
||||
captured_stream_.clear();
|
||||
|
||||
// Redirect std::cerr to our stringstream
|
||||
std::cerr.rdbuf(captured_stream_.rdbuf());
|
||||
|
||||
// Redirect stderr file descriptor to temp file (for fprintf to stderr)
|
||||
int temp_fd = open(temp_file_.c_str(), OPEN_FLAGS, OPEN_MODE);
|
||||
if (temp_fd != -1) {
|
||||
dup2(temp_fd, STDERR_FD);
|
||||
fd_close(temp_fd);
|
||||
}
|
||||
}
|
||||
|
||||
std::string stopCapture() {
|
||||
// Restore original cerr stream
|
||||
std::cerr.rdbuf(cerr_backup_);
|
||||
|
||||
// Restore original stderr file descriptor
|
||||
dup2(stderr_backup_, STDERR_FD);
|
||||
|
||||
// Read from temp file (captures fprintf(stderr) output from HIP logging)
|
||||
std::ifstream temp_file(temp_file_);
|
||||
std::string file_content;
|
||||
if (temp_file.is_open()) {
|
||||
std::string line;
|
||||
while (std::getline(temp_file, line)) {
|
||||
file_content += line + "\n";
|
||||
}
|
||||
temp_file.close();
|
||||
}
|
||||
|
||||
// Combine both captures: C++ streams and file descriptor output
|
||||
std::string stream_content = captured_stream_.str();
|
||||
std::string total_output = stream_content + file_content;
|
||||
|
||||
// Clean up temp file
|
||||
unlink(temp_file_.c_str());
|
||||
|
||||
return total_output;
|
||||
}
|
||||
|
||||
~OutCapture() {
|
||||
// Ensure everything is restored
|
||||
std::cerr.rdbuf(cerr_backup_);
|
||||
dup2(stderr_backup_, STDERR_FD);
|
||||
fd_close(stderr_backup_);
|
||||
unlink(temp_file_.c_str());
|
||||
}
|
||||
};
|
||||
@@ -0,0 +1,156 @@
|
||||
/*
|
||||
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <hip_test_common.hh>
|
||||
#include "OutCapture.hh"
|
||||
|
||||
/**
|
||||
* @addtogroup hipDynamicLogging hipDynamicLogging
|
||||
* @{
|
||||
* @ingroup ErrorTest
|
||||
* `hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask)` -
|
||||
* Sets logging parameters for HIP runtime.
|
||||
* `hipExtEnableLogging()` -
|
||||
* Enables HIP runtime logging.
|
||||
* `hipExtDisableLogging()` -
|
||||
* Disables HIP runtime logging.
|
||||
*/
|
||||
|
||||
static bool hipDynamicLoggingTest() {
|
||||
// Create output capture instance
|
||||
OutCapture capture;
|
||||
capture.startCapture();
|
||||
|
||||
// Set Logging params
|
||||
HIP_CHECK(hipExtSetLoggingParams(4, 0, -1));
|
||||
|
||||
// Logging is disabled here - allocate memory
|
||||
int* dptr = nullptr;
|
||||
HIP_CHECK(hipMalloc(&dptr, sizeof(int)));
|
||||
|
||||
// Stop capture after hipMalloc and check no output (logging disabled)
|
||||
std::string malloc_output = capture.stopCapture();
|
||||
if (malloc_output.size() != 0) {
|
||||
INFO("Unexpected logging output during hipMalloc (logging should be disabled): " << malloc_output);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Start capture before enabling logging
|
||||
capture.startCapture();
|
||||
|
||||
// Enable logging and do memset
|
||||
HIP_CHECK(hipExtEnableLogging());
|
||||
HIP_CHECK(hipMemset(dptr, 0x00, sizeof(int)));
|
||||
|
||||
// Disable logging
|
||||
HIP_CHECK(hipExtDisableLogging());
|
||||
|
||||
// Stop capture after disabling logging and check for output
|
||||
std::string logging_output = capture.stopCapture();
|
||||
if (logging_output.size() == 0) {
|
||||
INFO("Expected logging output during enabled logging period, but got none");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Clean up
|
||||
HIP_CHECK(hipFree(dptr));
|
||||
|
||||
INFO("Successfully captured HIP logging output (" << logging_output.size() << " bytes)");
|
||||
INFO("Logging output: " << logging_output);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Validates that HIP dynamic logging works correctly:
|
||||
* 1. No output when logging is disabled
|
||||
* 2. Logging output is captured when logging is enabled
|
||||
* 3. hipMemset operation produces logging output during enabled period
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/errorHandling/hipDynamicLogging.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
TEST_CASE("Unit_hipDynamicLogging_Positive_Basic") {
|
||||
int numDevices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
||||
|
||||
if (numDevices <= 0) {
|
||||
HipTest::HIP_SKIP_TEST("Skipping hipDynamicLogging test - no devices available");
|
||||
return;
|
||||
}
|
||||
|
||||
REQUIRE(hipDynamicLoggingTest() == true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test Description
|
||||
* ------------------------
|
||||
* - Validates that hipExtSetLoggingParams sets logging parameters correctly
|
||||
* and that logging can be enabled/disabled multiple times
|
||||
* Test source
|
||||
* ------------------------
|
||||
* - unit/errorHandling/hipDynamicLogging.cc
|
||||
* Test requirements
|
||||
* ------------------------
|
||||
* - HIP_VERSION >= 5.6
|
||||
*/
|
||||
TEST_CASE("Unit_hipDynamicLogging_Positive_MultipleEnableDisable") {
|
||||
int numDevices = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&numDevices));
|
||||
|
||||
if (numDevices <= 0) {
|
||||
HipTest::HIP_SKIP_TEST("Skipping hipDynamicLogging test - no devices available");
|
||||
return;
|
||||
}
|
||||
|
||||
// Test multiple enable/disable cycles
|
||||
OutCapture capture;
|
||||
int* dptr = nullptr;
|
||||
HIP_CHECK(hipMalloc(&dptr, sizeof(int)));
|
||||
|
||||
// Set different logging parameters
|
||||
HIP_CHECK(hipExtSetLoggingParams(3, 0, -1));
|
||||
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
// Start capture and enable logging
|
||||
capture.startCapture();
|
||||
HIP_CHECK(hipExtEnableLogging());
|
||||
HIP_CHECK(hipMemset(dptr, 0x42, sizeof(int)));
|
||||
HIP_CHECK(hipExtDisableLogging());
|
||||
|
||||
// Check that we captured some output
|
||||
std::string output = capture.stopCapture();
|
||||
REQUIRE(output.size() > 0);
|
||||
}
|
||||
|
||||
HIP_CHECK(hipFree(dptr));
|
||||
}
|
||||
|
||||
/**
|
||||
* End doxygen group ErrorTest.
|
||||
* @}
|
||||
*/
|
||||
@@ -45,7 +45,7 @@ MATH_UNARY_HP_KERNEL_DEF(hcos);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(hcos, static_cast<float (*)(float)>(std::cos),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(h2cos);
|
||||
|
||||
@@ -63,7 +63,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2cos);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(h2cos, static_cast<float (*)(float)>(std::cos),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(hsin);
|
||||
@@ -82,7 +82,7 @@ MATH_UNARY_HP_KERNEL_DEF(hsin);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(hsin, static_cast<float (*)(float)>(std::sin),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(h2sin);
|
||||
|
||||
@@ -100,7 +100,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2sin);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(h2sin, static_cast<float (*)(float)>(std::sin),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(hexp);
|
||||
@@ -119,7 +119,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(hexp, static_cast<float (*)(float)>(std::exp),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(h2exp);
|
||||
|
||||
@@ -137,7 +137,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(h2exp, static_cast<float (*)(float)>(std::exp),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(hexp10);
|
||||
@@ -156,7 +156,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp10);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(hexp10, static_cast<float (*)(float)>(exp10f),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(h2exp10);
|
||||
|
||||
@@ -174,7 +174,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp10);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(h2exp10, static_cast<float (*)(float)>(exp10f),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(hexp2);
|
||||
@@ -193,7 +193,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp2);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(hexp2, static_cast<float (*)(float)>(std::exp2),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(h2exp2);
|
||||
|
||||
@@ -211,7 +211,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp2);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(h2exp2, static_cast<float (*)(float)>(std::exp2),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(hlog);
|
||||
@@ -230,7 +230,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(hlog, static_cast<float (*)(float)>(std::log),
|
||||
ULPValidatorBuilderFactory<float>(1));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(h2log);
|
||||
|
||||
@@ -248,7 +248,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(h2log, static_cast<float (*)(float)>(std::log),
|
||||
ULPValidatorBuilderFactory<float>(1));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(hlog10);
|
||||
@@ -267,7 +267,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog10);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(hlog10, static_cast<float (*)(float)>(std::log10),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(h2log10);
|
||||
|
||||
@@ -285,7 +285,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log10);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(h2log10, static_cast<float (*)(float)>(std::log10),
|
||||
ULPValidatorBuilderFactory<float>(2));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(hlog2);
|
||||
@@ -304,7 +304,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog2);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(hlog2, static_cast<float (*)(float)>(std::log2),
|
||||
ULPValidatorBuilderFactory<float>(1));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(h2log2);
|
||||
|
||||
@@ -322,7 +322,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log2);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(h2log2, static_cast<float (*)(float)>(std::log2),
|
||||
ULPValidatorBuilderFactory<float>(1));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(hsqrt);
|
||||
@@ -341,7 +341,7 @@ MATH_UNARY_HP_KERNEL_DEF(hsqrt);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(hsqrt, static_cast<float (*)(float)>(std::sqrt),
|
||||
ULPValidatorBuilderFactory<float>(1));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(h2sqrt);
|
||||
|
||||
@@ -359,7 +359,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2sqrt);
|
||||
* - HIP_VERSION >= 5.2
|
||||
*/
|
||||
MATH_UNARY_HP_TEST_DEF_IMPL(h2sqrt, static_cast<float (*)(float)>(std::sqrt),
|
||||
ULPValidatorBuilderFactory<float>(1));
|
||||
ULPValidatorBuilderFactory<Float16>(1));
|
||||
|
||||
|
||||
MATH_UNARY_HP_KERNEL_DEF(hceil);
|
||||
|
||||
@@ -187,7 +187,9 @@ template <typename T, typename... Ts> class MathTest {
|
||||
std::stringstream ss;
|
||||
ss << "Input value(s): " << std::scientific
|
||||
<< std::setprecision(std::numeric_limits<T>::max_digits10 - 1);
|
||||
((ss << " " << args), ...) << "\n" << actual_val << " ";
|
||||
((ss << " " << args), ...) << "\n"
|
||||
<< "Output value: " << actual_val << "\n"
|
||||
<< "Condition failed: ";
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
@@ -25,6 +25,12 @@ THE SOFTWARE.
|
||||
#include <catch2/catch_all.hpp>
|
||||
#include <catch2/matchers/catch_matchers_floating_point.hpp>
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <iomanip>
|
||||
|
||||
#include "Float16.hh"
|
||||
|
||||
// Define a new MatcherBase class with a public 'describe' member function because
|
||||
// Catch::MatcherBase::describe is protected and thus can't be used via a pointer to
|
||||
// Catch::MatcherBase.
|
||||
@@ -61,6 +67,113 @@ template <typename T, typename Matcher> class ValidatorBase : public MatcherBase
|
||||
bool nan = false;
|
||||
};
|
||||
|
||||
struct Float16WithinUlpsMatcher : MatcherBase<Float16> {
|
||||
Float16WithinUlpsMatcher(Float16 target, uint64_t ulps) : m_target(target), m_ulps(ulps) {}
|
||||
|
||||
bool match(Float16 const& matchee) const override {
|
||||
// Comparison with NaN should always be false.
|
||||
// This way we can rule it out before getting into the ugly details
|
||||
if (__hisnan(matchee) || __hisnan(m_target)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
auto value_bits = convertFloat16toInt16(matchee);
|
||||
auto target_bits = convertFloat16toInt16(m_target);
|
||||
|
||||
// If signs differ, handle the special +0 vs -0 case explicitly.
|
||||
if ((value_bits < 0) != (target_bits < 0)) {
|
||||
return matchee == m_target;
|
||||
}
|
||||
|
||||
auto ulp_diff = std::abs(value_bits - target_bits);
|
||||
return static_cast<uint64_t>(ulp_diff) <= m_ulps;
|
||||
}
|
||||
|
||||
std::string describe() const override {
|
||||
std::stringstream ret;
|
||||
|
||||
ret << "is within " << m_ulps << " ULPs of ";
|
||||
|
||||
write(ret, m_target);
|
||||
ret << 'f';
|
||||
ret << " ([";
|
||||
|
||||
write(ret, step(m_target, -FLOAT16_MAX, m_ulps));
|
||||
ret << ", ";
|
||||
write(ret, step(m_target, FLOAT16_MAX, m_ulps));
|
||||
|
||||
ret << "])";
|
||||
|
||||
return ret.str();
|
||||
}
|
||||
|
||||
private:
|
||||
Float16 getNextAfter(Float16 from, Float16 direction) const {
|
||||
constexpr int16_t signbit_float16 = 0x8000;
|
||||
|
||||
// Encode inputs as 16-bit integers
|
||||
const int16_t from_bits = convertFloat16toInt16(from);
|
||||
const int16_t direction_bits = convertFloat16toInt16(direction);
|
||||
|
||||
// Special cases
|
||||
if (from_bits == direction_bits) return direction_bits;
|
||||
if (std::abs(from_bits) == static_cast<int16_t>(0) &&
|
||||
std::abs(direction_bits) == static_cast<int16_t>(0))
|
||||
return direction;
|
||||
|
||||
// Makes integer comparisons reflect numeric ordering across sign.
|
||||
const int16_t from_ordered = (from_bits < 0) ? signbit_float16 - from_bits : from_bits;
|
||||
const int16_t direction_ordered =
|
||||
(direction_bits < 0) ? signbit_float16 - direction_bits : direction_bits;
|
||||
|
||||
// Decide whether to move up or down by one ULP
|
||||
const int16_t step = (from_ordered < direction_ordered) ? 1 : -1;
|
||||
|
||||
// Take one step
|
||||
const int16_t after_step_ordered = from_ordered + step;
|
||||
|
||||
// Map back from ordered space to raw Float16 bits.
|
||||
int16_t next_bits =
|
||||
(after_step_ordered < 0) ? signbit_float16 - after_step_ordered : after_step_ordered;
|
||||
|
||||
// Handle boundary behavior for the most-negative edge case.
|
||||
if (from_ordered == -1 && (from_ordered < direction_ordered)) {
|
||||
next_bits = signbit_float16;
|
||||
}
|
||||
|
||||
return convertInt16toFloat16(next_bits);
|
||||
}
|
||||
|
||||
Float16 step(Float16 start, Float16 direction, uint64_t steps) const {
|
||||
Float16 result = start;
|
||||
for (uint64_t i = 0; i < steps; ++i) {
|
||||
result = getNextAfter(result, direction);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void write(std::ostream& out, Float16 num) const {
|
||||
const uint32_t float16_max_digits = 5;
|
||||
out << std::scientific << std::setprecision(float16_max_digits) << num;
|
||||
}
|
||||
|
||||
static Float16 convertInt16toFloat16(int16_t d) {
|
||||
Float16 i;
|
||||
std::memcpy(&i, &d, sizeof(int16_t));
|
||||
return i;
|
||||
}
|
||||
|
||||
static int16_t convertFloat16toInt16(Float16 d) {
|
||||
uint16_t i;
|
||||
std::memcpy(&i, &d, sizeof(Float16));
|
||||
return i;
|
||||
}
|
||||
|
||||
Float16 m_target;
|
||||
uint64_t m_ulps;
|
||||
};
|
||||
|
||||
|
||||
template <typename T> auto ULPValidatorBuilderFactory(int64_t ulps) {
|
||||
return [=](T target, auto&&...) {
|
||||
return std::make_unique<ValidatorBase<T, Catch::Matchers::WithinUlpsMatcher>>(
|
||||
@@ -68,6 +181,13 @@ template <typename T> auto ULPValidatorBuilderFactory(int64_t ulps) {
|
||||
};
|
||||
};
|
||||
|
||||
template <> inline auto ULPValidatorBuilderFactory<Float16>(int64_t ulps) {
|
||||
return [=](Float16 target, auto&&...) {
|
||||
return std::make_unique<ValidatorBase<Float16, Float16WithinUlpsMatcher>>(
|
||||
target, Float16WithinUlpsMatcher(target, ulps));
|
||||
};
|
||||
};
|
||||
|
||||
template <typename T> auto AbsValidatorBuilderFactory(double margin) {
|
||||
return [=](T target, auto&&...) {
|
||||
return std::make_unique<ValidatorBase<T, Catch::Matchers::WithinAbsMatcher>>(
|
||||
@@ -96,7 +216,7 @@ template <typename T> class EqValidator : public MatcherBase<T> {
|
||||
|
||||
std::string describe() const override {
|
||||
std::stringstream ss;
|
||||
ss << " is not equal to " << target_;
|
||||
ss << "is equal to " << target_;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
|
||||
@@ -177,8 +177,8 @@ static bool checkhipMemPoolTrimTo(hipStream_t stream, int N, int dev = 0) {
|
||||
testObj.transferFromMempool(stream);
|
||||
testObj.freeDevBuf(stream);
|
||||
// verify and validate
|
||||
REQUIRE(true == testObj.validateResult());
|
||||
HIP_CHECK(hipStreamSynchronize(stream));
|
||||
REQUIRE(true == testObj.validateResult());
|
||||
}
|
||||
HIP_CHECK(hipMemPoolDestroy(mem_pool));
|
||||
return true;
|
||||
|
||||
@@ -494,7 +494,6 @@ HIP doesn't support the following CUDA functions/operators in ``cooperative_grou
|
||||
* ``synchronize``
|
||||
* ``memcpy_async``
|
||||
* ``wait`` and ``wait_prior``
|
||||
* ``barrier_arrive`` and ``barrier_wait``
|
||||
* ``invoke_one`` and ``invoke_one_broadcast``
|
||||
* ``reduce``
|
||||
* ``reduce_update_async`` and ``reduce_store_async``
|
||||
|
||||
@@ -9615,6 +9615,45 @@ hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject);
|
||||
/**
|
||||
* @}
|
||||
*/
|
||||
|
||||
/**
|
||||
* @brief Enable HIP runtime logging.
|
||||
*
|
||||
* This function enables the HIP runtime logging mechanism, allowing diagnostic
|
||||
* and trace information to be captured during HIP API execution.
|
||||
*
|
||||
* @returns #hipSuccess
|
||||
*
|
||||
* @see hipExtDisableLogging, hipExtSetLoggingParams
|
||||
*/
|
||||
hipError_t hipExtEnableLogging();
|
||||
/**
|
||||
* @brief Disable HIP runtime logging.
|
||||
*
|
||||
* This function disables the HIP runtime logging mechanism, stopping the capture
|
||||
* of diagnostic and trace information during HIP API execution.
|
||||
*
|
||||
* @returns #hipSuccess
|
||||
*
|
||||
* @see hipExtEnableLogging, hipExtSetLoggingParams
|
||||
*/
|
||||
hipError_t hipExtDisableLogging();
|
||||
/**
|
||||
* @brief Set HIP runtime logging parameters.
|
||||
*
|
||||
* This function configures the logging behavior of the HIP runtime, including
|
||||
* the verbosity level, buffer size, and which components to log.
|
||||
*
|
||||
* @param [in] log_level The logging verbosity level. Higher values produce more detailed output.
|
||||
* @param [in] log_size Reserved for future use. Currently not implemented.
|
||||
* @param [in] log_mask A bitmask specifying which HIP runtime components to log.
|
||||
*
|
||||
* @returns #hipSuccess, #hipErrorInvalidValue
|
||||
*
|
||||
* @see hipExtEnableLogging, hipExtDisableLogging
|
||||
*/
|
||||
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "c" */
|
||||
#endif
|
||||
|
||||
@@ -2,7 +2,7 @@ default_stages: [pre-commit]
|
||||
fail_fast: true
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
rev: v6.0.0
|
||||
hooks:
|
||||
- id: check-yaml
|
||||
- id: end-of-file-fixer
|
||||
@@ -12,7 +12,7 @@ repos:
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
# Ruff version. Check https://github.com/astral-sh/ruff-pre-commit#version-compatibility
|
||||
# for the latest ruff version supported by the hook.
|
||||
rev: v0.12.12
|
||||
rev: v0.14.11
|
||||
hooks:
|
||||
- id: ruff-check
|
||||
args: [--fix]
|
||||
|
||||
@@ -49,6 +49,10 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
|
||||
|
||||
* Fix issue where counter collection data was empty when profiling workload which spawn multiple child processes
|
||||
|
||||
* Fix issue where dispatch filtering in a range (e.g. >2) was not working
|
||||
|
||||
* Fix redundant warnings for compute/memory partition not found for < MI 300 series GPUs by skipping partition checks
|
||||
|
||||
### Removed
|
||||
|
||||
* Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter.
|
||||
|
||||
@@ -4,14 +4,13 @@
|
||||
|
||||
ROCm Compute Profiler is a system performance profiling tool for machine
|
||||
learning/HPC workloads running on AMD MI GPUs. The tool presently
|
||||
targets usage on MI100, MI200, and MI300 accelerators.
|
||||
targets usage on MI100, MI200, MI300, and MI350 series accelerators.
|
||||
|
||||
* For more information on available features, installation steps, and
|
||||
workload profiling and analysis, please refer to the online
|
||||
[documentation](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/).
|
||||
|
||||
* ROCm Compute Profiler is an AMD open source research project and is not supported
|
||||
as part of the ROCm software stack. We welcome contributions and
|
||||
* ROCm Compute Profiler is an AMD open source tool that is part of the ROCm software stack. We welcome contributions and
|
||||
feedback from the community. Please see the
|
||||
[CONTRIBUTING.md](CONTRIBUTING.md) file for additional details on our
|
||||
contribution process.
|
||||
@@ -39,8 +38,8 @@ python3 -m pip install -r requirements.txt
|
||||
|
||||
## Testing
|
||||
|
||||
Populate the <usename> variable in `docker/docker-compose.customrocmtest.yml`.
|
||||
Populate the <rocm_build_image> variable in `docker/Dockerfile.customrocmtest` based on latest ROCm CI build information.
|
||||
Populate the <username> variable in `docker/docker-compose.customrocmtest.yml`.
|
||||
Populate the <tarball_name> variable in `docker/Dockerfile.customrocmtest` based on latest TheRock nightly build information.
|
||||
|
||||
To quickly get the environment (bash shell) for building and testing, run the following commands:
|
||||
* `cd docker`
|
||||
@@ -115,7 +114,7 @@ This software can be cited using a Zenodo
|
||||
style reference is provided below for convenience:
|
||||
|
||||
```
|
||||
@software{xiaomin_lu_2022_7314631
|
||||
@misc{xiaomin_lu_2022_7314631
|
||||
author = {Xiaomin Lu and
|
||||
Cole Ramos and
|
||||
Fei Zheng and
|
||||
@@ -124,12 +123,7 @@ style reference is provided below for convenience:
|
||||
Keith Lowery and
|
||||
Nicholas Curtis and
|
||||
Cristian Di Pietrantonio},
|
||||
title = {ROCm/rocprofiler-compute: v3.1.0 (12 February 2025)},
|
||||
month = February,
|
||||
year = 2025,
|
||||
publisher = {Zenodo},
|
||||
version = {v3.1.0},
|
||||
doi = {10.5281/zenodo.7314631},
|
||||
url = {https://doi.org/10.5281/zenodo.7314631}
|
||||
title = {rocprofiler-compute},
|
||||
url = {https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-compute}
|
||||
}
|
||||
```
|
||||
|
||||
@@ -1,32 +1,52 @@
|
||||
# Use a base image
|
||||
FROM <rocm_build_image>
|
||||
FROM ubuntu:22.04
|
||||
|
||||
# Set the working directory
|
||||
WORKDIR /app
|
||||
# Install curl first (needed for ROCm download)
|
||||
RUN apt-get update && apt-get install -y curl
|
||||
|
||||
# Define the tarball name as a variable
|
||||
# Check https://therock-nightly-tarball.s3.amazonaws.com/index.html for latest builds
|
||||
# Use therock-dist-linux-gfx<arch>-dcgpu-<rocm-version>.tar.gz naming convention
|
||||
ARG TARBALL_NAME=<tarball_name>
|
||||
|
||||
# Install ROCm from TheRock Nightly build
|
||||
RUN mkdir -p /rocm && \
|
||||
curl -fLO https://therock-nightly-tarball.s3.amazonaws.com/${TARBALL_NAME} && \
|
||||
tar -xf ${TARBALL_NAME} -C /rocm && \
|
||||
rm ${TARBALL_NAME}
|
||||
|
||||
# Set environment variables for ROCm
|
||||
ENV PATH="/rocm/bin:${PATH}" \
|
||||
ROCM_PATH="/rocm" \
|
||||
LD_LIBRARY_PATH="/rocm/lib:/rocm/lib/rocm_sysdeps/lib:${LD_LIBRARY_PATH}" \
|
||||
HIP_DEVICE_LIB_PATH="/rocm/llvm/amdgcn/bitcode" \
|
||||
HIP_PLATFORM=amd
|
||||
|
||||
# Update package list and install prerequisites
|
||||
RUN apt-get update && apt-get install -y \
|
||||
software-properties-common cmake locales git curl \
|
||||
software-properties-common cmake locales git \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa \
|
||||
&& apt-get update
|
||||
|
||||
# Allows running git commands in /app
|
||||
RUN git config --global --add safe.directory /app
|
||||
|
||||
# Generate the desired locale
|
||||
RUN locale-gen en_US.UTF-8
|
||||
|
||||
# Install Python 3.10 and pip
|
||||
RUN apt-get install -y python3.10 python3.10-venv python3.10-dev python3-pip libsqlite3-dev
|
||||
RUN python3.10 -m venv venv
|
||||
ENV PATH="venv/bin:$PATH"
|
||||
RUN python3.10 -m venv /venv
|
||||
ENV PATH="/venv/bin:$PATH"
|
||||
RUN python -m pip install --upgrade pip
|
||||
|
||||
# Install any dependencies specified in requirements.txt
|
||||
WORKDIR /app/projects/rocprofiler-compute
|
||||
# Install any rocprofiler-compute dependencies specified in requirements.txt
|
||||
COPY projects/rocprofiler-compute/requirements.txt /app/projects/rocprofiler-compute/requirements.txt
|
||||
COPY projects/rocprofiler-compute/requirements-test.txt /app/projects/rocprofiler-compute/requirements-test.txt
|
||||
RUN python -m pip install -r requirements.txt -r requirements-test.txt
|
||||
RUN python -m pip install -r /app/projects/rocprofiler-compute/requirements.txt -r /app/projects/rocprofiler-compute/requirements-test.txt
|
||||
|
||||
# Set the working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Allows running git commands in /app
|
||||
RUN git config --global --add safe.directory /app
|
||||
|
||||
# Run interactive bash shell
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
Arquivo binário não exibido.
|
Antes Largura: | Altura: | Tamanho: 185 KiB Depois Largura: | Altura: | Tamanho: 254 KiB |
Arquivo binário não exibido.
|
Antes Largura: | Altura: | Tamanho: 34 KiB Depois Largura: | Altura: | Tamanho: 240 KiB |
@@ -15,7 +15,7 @@ This section provides an overview of ROCm Compute Profiler's CLI analysis featur
|
||||
* :ref:`Metric customization <cli-analysis-options>`: Isolate a subset of built-in metrics or build your own profiling configuration.
|
||||
|
||||
* :ref:`Filtering <cli-analysis-options>`: Hone in on a particular kernel, GPU ID, or dispatch ID via post-process filtering.
|
||||
|
||||
|
||||
* :ref:`Per-kernel roofline analysis <per-kernel-roofline>`: Detailed arithmetic intensity and performance analysis for individual kernels.
|
||||
|
||||
Run ``rocprof-compute analyze -h`` for more details.
|
||||
@@ -346,6 +346,7 @@ Show System Speed-of-Light and CS_Busy blocks only
|
||||
this case, ``1`` is the ID for System Speed-of-Light and ``5.1.0`` the ID for
|
||||
GPU Busy Cycles metric.
|
||||
|
||||
|
||||
Filter kernels
|
||||
First, list the top kernels in your application using `--list-stats`.
|
||||
|
||||
@@ -534,36 +535,40 @@ Analysis database example
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
$ rocprof-compute analyze --verbose --db test -p workloads/vmem/MI300X_A1 -p workloads/vmem1/MI300X_A1
|
||||
$ rocprof-compute analyze --verbose --output-name test --output-format db -p workloads/nbody/MI300X_A1 -p workloads/nbody1/MI300X_A1
|
||||
DEBUG Execution mode = analyze
|
||||
|
||||
__ _
|
||||
_ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___
|
||||
| '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \
|
||||
| | | (_) | (__| |_) | | | (_) | _|_____| (_| (_) | | | | | | |_) | |_| | || __/
|
||||
|_| \___/ \___| .__/|_| \___/|_| \___\___/|_| |_| |_| .__/ \__,_|\__\___|
|
||||
|_| |_|
|
||||
__ _
|
||||
_ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___
|
||||
| '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \
|
||||
| | | (_) | (__| |_) | | | (_) | _|_____| (_| (_) | | | | | | |_) | |_| | || __/
|
||||
|_| \___/ \___| .__/|_| \___/|_| \___\___/|_| |_| |_| .__/ \__,_|\__\___|
|
||||
|_| |_|
|
||||
|
||||
INFO Analysis mode = db
|
||||
DEBUG [omnisoc init]
|
||||
DEBUG [omnisoc init]
|
||||
DEBUG [analysis] prepping to do some analysis
|
||||
INFO [analysis] deriving rocprofiler-compute metrics...
|
||||
WARNING Roofline ceilings not found for /app/projects/rocprofiler-compute/workloads/vmem/MI300X_A1.
|
||||
WARNING Roofline ceilings not found for /app/projects/rocprofiler-compute/workloads/vmem1/MI300X_A1.
|
||||
WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/vmem/MI300X_A1.
|
||||
WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/vmem1/MI300X_A1.
|
||||
DEBUG Collected dispatch data
|
||||
DEBUG Applied analysis mode filters
|
||||
DEBUG Calculated dispatch data
|
||||
DEBUG Collected metrics data
|
||||
WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg(
|
||||
INFO Analysis mode = db
|
||||
INFO ed45b0b189
|
||||
DEBUG [omnisoc init]
|
||||
INFO ed45b0b189
|
||||
DEBUG [omnisoc init]
|
||||
DEBUG [analysis] prepping to do some analysis
|
||||
INFO [analysis] deriving rocprofiler-compute metrics...
|
||||
DEBUG Collected roofline ceilings
|
||||
WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/nbody/MI300X_A1.
|
||||
WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/nbody1/MI300X_A1.
|
||||
DEBUG Collected dispatch data
|
||||
DEBUG Applied analysis mode filters
|
||||
DEBUG Calculated dispatch data
|
||||
DEBUG Collected metrics data
|
||||
WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg(
|
||||
(pmc_df.get("pmc_perf_ACCUM") / pmc_df.get("SQC_ICACHE_REQ")).where((pmc_df.get("SQC_ICACHE_REQ") != 0), None)) * 100), 0) - unsupported operand type(s) for /: 'NoneType' and 'float'
|
||||
WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg(
|
||||
WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg(
|
||||
(pmc_df.get("pmc_perf_ACCUM") / pmc_df.get("SQC_ICACHE_REQ")).where((pmc_df.get("SQC_ICACHE_REQ") != 0), None)) * 100), 0) - unsupported operand type(s) for /: 'NoneType' and 'float'
|
||||
DEBUG Calculated metric values
|
||||
DEBUG Calculated roofline data points
|
||||
DEBUG [analysis] generating analysis
|
||||
DEBUG SQLite database initialized with name: test.db
|
||||
DEBUG Initialized database: test.db
|
||||
DEBUG Completed writing database
|
||||
DEBUG Calculated metric values
|
||||
DEBUG Calculated roofline data points
|
||||
DEBUG [analysis] generating analysis
|
||||
DEBUG SQLite database initialized with name: test.db
|
||||
DEBUG Initialized database: test.db
|
||||
INFO ed45b0b189
|
||||
INFO ed45b0b189
|
||||
DEBUG Completed writing database
|
||||
WARNING Created file: test.db
|
||||
|
||||
@@ -28,7 +28,9 @@ Launch the standalone GUI analyzer
|
||||
----------------------------------
|
||||
|
||||
To launch the ROCm Compute Profiler GUI analyzer, include the ``--gui`` flag with your
|
||||
desired analysis command. For example:
|
||||
desired analysis command.
|
||||
|
||||
For example:
|
||||
|
||||
.. code-block:: shell-session
|
||||
|
||||
|
||||
@@ -386,26 +386,24 @@ class OmniAnalyze_Base:
|
||||
sys.exit(0)
|
||||
|
||||
# Ensure analysis output does not overwrite existing files
|
||||
if not args.output_name:
|
||||
return
|
||||
if args.output_name:
|
||||
if not re.match(r"^[A-Za-z0-9_-]+$", args.output_name):
|
||||
console_error(
|
||||
"analysis",
|
||||
"Analysis output file/folder name must "
|
||||
"contain only alphanumeric characters "
|
||||
"or underscores (_), hyphens (-).",
|
||||
)
|
||||
|
||||
if not re.match(r"^[A-Za-z0-9_-]+$", args.output_name):
|
||||
console_error(
|
||||
"analysis",
|
||||
"Analysis output file/folder name must "
|
||||
"contain only alphanumeric characters "
|
||||
"or underscores (_), hyphens (-).",
|
||||
)
|
||||
path_to_check = args.output_name
|
||||
if args.output_format in ("txt", "db"):
|
||||
path_to_check += f".{args.output_format}"
|
||||
|
||||
path_to_check = args.output_name
|
||||
if args.output_format in ("txt", "db"):
|
||||
path_to_check += f".{args.output_format}"
|
||||
|
||||
if Path(path_to_check).exists():
|
||||
console_error(
|
||||
f"Analysis output file/folder {path_to_check} already exists. "
|
||||
"Please choose a different name."
|
||||
)
|
||||
if Path(path_to_check).exists():
|
||||
console_error(
|
||||
f"Analysis output file/folder {path_to_check} already exists. "
|
||||
"Please choose a different name."
|
||||
)
|
||||
|
||||
# Check if any kernel's counters are missing due to iteration multiplexing
|
||||
if (
|
||||
|
||||
@@ -101,7 +101,9 @@ class db_analysis(OmniAnalyze_Base):
|
||||
Database.init(db_name)
|
||||
console_debug(f"Initialized database: {db_name}")
|
||||
|
||||
# Iterate over all workloads
|
||||
for workload_path in self._runs.keys():
|
||||
# Add workload
|
||||
workload_obj = orm.Workload(
|
||||
name=workload_path.split("/")[-2],
|
||||
sub_name=workload_path.split("/")[-1],
|
||||
@@ -113,38 +115,9 @@ class db_analysis(OmniAnalyze_Base):
|
||||
)
|
||||
Database.get_session().add(workload_obj)
|
||||
|
||||
for pc_sample in self._pc_sampling_data_per_workload.get(
|
||||
workload_path, pd.DataFrame()
|
||||
).itertuples():
|
||||
Database.get_session().add(
|
||||
orm.PCsampling(
|
||||
source=pc_sample.source_line,
|
||||
instruction=pc_sample.instruction,
|
||||
count=pc_sample.count,
|
||||
kernel_name=pc_sample.kernel_name,
|
||||
offset=pc_sample.offset,
|
||||
count_issue=pc_sample.count_issued,
|
||||
count_stall=pc_sample.count_stalled,
|
||||
stall_reason=pc_sample.stall_reason,
|
||||
workload=workload_obj,
|
||||
)
|
||||
)
|
||||
|
||||
for roofline_data in self._roofline_data_per_workload.get(
|
||||
workload_path, pd.DataFrame()
|
||||
).itertuples():
|
||||
Database.get_session().add(
|
||||
orm.RooflineData(
|
||||
kernel_name=roofline_data.kernel_name,
|
||||
total_flops=roofline_data.total_flops,
|
||||
l1_cache_data=roofline_data.l1_cache_data,
|
||||
l2_cache_data=roofline_data.l2_cache_data,
|
||||
hbm_cache_data=roofline_data.hbm_cache_data,
|
||||
workload=workload_obj,
|
||||
)
|
||||
)
|
||||
|
||||
# Add kernel
|
||||
kernel_objs: dict[str, orm.Kernel] = {}
|
||||
|
||||
for dispatch in self._dispatch_data_per_workload.get(
|
||||
workload_path, pd.DataFrame()
|
||||
).itertuples():
|
||||
@@ -167,44 +140,101 @@ class db_analysis(OmniAnalyze_Base):
|
||||
)
|
||||
)
|
||||
|
||||
# Optimize: Pre-group values by (metric_id, kernel_name) for O(1) lookups
|
||||
values_df = self._values_data_per_workload.get(
|
||||
workload_path, pd.DataFrame()
|
||||
)
|
||||
values_grouped = {}
|
||||
if not values_df.empty:
|
||||
for value in values_df.itertuples():
|
||||
key = (value.metric_id, value.kernel_name)
|
||||
if key not in values_grouped:
|
||||
values_grouped[key] = []
|
||||
values_grouped[key].append(value)
|
||||
|
||||
for metric in self._metrics_info_data_per_workload.get(
|
||||
# Add roofline data points
|
||||
for roofline_data in self._roofline_data_per_workload.get(
|
||||
workload_path, pd.DataFrame()
|
||||
).itertuples():
|
||||
for kernel_name in kernel_objs.keys():
|
||||
metric_obj = orm.Metric(
|
||||
name=metric.name,
|
||||
metric_id=metric.metric_id,
|
||||
description=metric.description,
|
||||
unit=metric.unit,
|
||||
table_name=metric.table_name,
|
||||
sub_table_name=metric.sub_table_name,
|
||||
kernel=kernel_objs[kernel_name],
|
||||
if roofline_data.kernel_name not in kernel_objs:
|
||||
console_warning(
|
||||
f"Kernel {roofline_data.kernel_name} from roofline data "
|
||||
"not found in dispatch data. Skipping roofline entry."
|
||||
)
|
||||
Database.get_session().add(metric_obj)
|
||||
continue
|
||||
Database.get_session().add(
|
||||
orm.RooflineData(
|
||||
total_flops=roofline_data.total_flops,
|
||||
l1_cache_data=roofline_data.l1_cache_data,
|
||||
l2_cache_data=roofline_data.l2_cache_data,
|
||||
hbm_cache_data=roofline_data.hbm_cache_data,
|
||||
kernel=kernel_objs[roofline_data.kernel_name],
|
||||
)
|
||||
)
|
||||
|
||||
# Direct lookup instead of iterating through all values
|
||||
key = (metric.metric_id, kernel_name)
|
||||
for value in values_grouped.get(key, []):
|
||||
Database.get_session().add(
|
||||
orm.Value(
|
||||
metric=metric_obj,
|
||||
value_name=value.value_name,
|
||||
value=value.value,
|
||||
)
|
||||
# Add pc sampling data
|
||||
for pc_sample in self._pc_sampling_data_per_workload.get(
|
||||
workload_path, pd.DataFrame()
|
||||
).itertuples():
|
||||
if pc_sample.kernel_name not in kernel_objs:
|
||||
console_warning(
|
||||
f"Kernel {pc_sample.kernel_name} from PC sampling data "
|
||||
"not found in dispatch data. Skipping PC sampling entry."
|
||||
)
|
||||
continue
|
||||
Database.get_session().add(
|
||||
orm.PCsampling(
|
||||
source=pc_sample.source_line,
|
||||
instruction=pc_sample.instruction,
|
||||
count=pc_sample.count,
|
||||
offset=pc_sample.offset,
|
||||
count_issue=pc_sample.count_issued,
|
||||
count_stall=pc_sample.count_stalled,
|
||||
stall_reason=pc_sample.stall_reason,
|
||||
kernel=kernel_objs[pc_sample.kernel_name],
|
||||
)
|
||||
)
|
||||
|
||||
# Add metrics and values - iterate on values, create metrics as needed
|
||||
metrics_info_dict = {
|
||||
row.metric_id: row
|
||||
for row in self._metrics_info_data_per_workload.get(
|
||||
workload_path, pd.DataFrame()
|
||||
).itertuples()
|
||||
}
|
||||
metric_objs: dict[str, orm.MetricDefinition] = {}
|
||||
|
||||
for value in self._values_data_per_workload.get(
|
||||
workload_path, pd.DataFrame()
|
||||
).itertuples():
|
||||
# Check if kernel exists
|
||||
if value.kernel_name not in kernel_objs:
|
||||
console_warning(
|
||||
f"Kernel {value.kernel_name} from values data "
|
||||
"not found in dispatch data. Skipping metric value."
|
||||
)
|
||||
continue
|
||||
|
||||
# Create or reuse metric object
|
||||
if value.metric_id not in metric_objs:
|
||||
# Fetch metric info
|
||||
if value.metric_id not in metrics_info_dict:
|
||||
console_warning(
|
||||
f"Metric {value.metric_id} from values data "
|
||||
"not found in metrics info. Skipping metric value."
|
||||
)
|
||||
continue
|
||||
metric_info = metrics_info_dict[value.metric_id]
|
||||
metric_objs[value.metric_id] = orm.MetricDefinition(
|
||||
name=metric_info.name,
|
||||
metric_id=metric_info.metric_id,
|
||||
description=metric_info.description,
|
||||
unit=metric_info.unit,
|
||||
table_name=metric_info.table_name,
|
||||
sub_table_name=metric_info.sub_table_name,
|
||||
workload=workload_obj,
|
||||
)
|
||||
Database.get_session().add(metric_objs[value.metric_id])
|
||||
|
||||
# Add value
|
||||
Database.get_session().add(
|
||||
orm.MetricValue(
|
||||
metric=metric_objs[value.metric_id],
|
||||
kernel=kernel_objs[value.kernel_name],
|
||||
value_name=value.value_name,
|
||||
value=value.value,
|
||||
)
|
||||
)
|
||||
|
||||
# Add metadata
|
||||
version = get_version(rocprof_compute_home)
|
||||
Database.get_session().add(
|
||||
orm.Metadata(
|
||||
|
||||
@@ -51,7 +51,6 @@ class webui_analysis(OmniAnalyze_Base):
|
||||
self.app = dash.Dash(
|
||||
__name__, title=PROJECT_NAME, external_stylesheets=[dbc.themes.CYBORG]
|
||||
)
|
||||
self.dest_dir = str(Path(args.path[0][0]).absolute().resolve())
|
||||
self.arch: Optional[str] = None
|
||||
|
||||
self.__hidden_sections = ["Memory Chart"]
|
||||
@@ -90,6 +89,7 @@ class webui_analysis(OmniAnalyze_Base):
|
||||
kernel_top_df = base_data.dfs[1]
|
||||
for kernel_id in base_data.filter_kernel_ids:
|
||||
filt_kernel_names.append(str(kernel_top_df.loc[kernel_id, "Kernel_Name"]))
|
||||
input_filters["kernel"] = filt_kernel_names
|
||||
|
||||
# setup app layout
|
||||
from utils.gui_components.header import get_header
|
||||
@@ -338,6 +338,7 @@ class webui_analysis(OmniAnalyze_Base):
|
||||
)
|
||||
|
||||
args = self.get_args()
|
||||
self.dest_dir = str(Path(args.path[0][0]).absolute().resolve())
|
||||
|
||||
# create 'mega dataframe'
|
||||
self._runs[self.dest_dir].raw_pmc = file_io.create_df_pmc(
|
||||
|
||||
@@ -531,12 +531,15 @@ class RocProfCompute_Base:
|
||||
and not args.attach_pid
|
||||
):
|
||||
# Use native counter collection tool
|
||||
# Use lib* glob pattern to handle CMAKE_INSTALL_LIBDIR variations
|
||||
# (lib, lib64, lib32, etc. depending on distribution)
|
||||
native_tool_base_path = Path(sys.argv[0]).resolve().parents[2]
|
||||
native_tool_glob_pattern = (
|
||||
"lib*/rocprofiler-compute/librocprofiler-compute-tool.so"
|
||||
)
|
||||
try:
|
||||
native_tool_path = str(
|
||||
Path(sys.argv[0]).resolve().parents[2]
|
||||
/ "lib"
|
||||
/ "rocprofiler-compute"
|
||||
/ "librocprofiler-compute-tool.so"
|
||||
next(native_tool_base_path.glob(native_tool_glob_pattern))
|
||||
)
|
||||
except Exception as e:
|
||||
console_debug(
|
||||
@@ -552,6 +555,7 @@ class RocProfCompute_Base:
|
||||
)
|
||||
/ "librocprofiler-compute-tool.so"
|
||||
)
|
||||
native_tool_cpp_path = Path(__file__).resolve().parents[1] / "lib"
|
||||
link_libraries = ("rocprofiler-sdk",)
|
||||
build_command = (
|
||||
# Create shared object
|
||||
@@ -564,10 +568,10 @@ class RocProfCompute_Base:
|
||||
# rocprofiler sdk library path
|
||||
f"-L {str(Path(args.rocprofiler_sdk_tool_path).parent.parent)} "
|
||||
# native tool source files (tool.cpp and helper.cpp)
|
||||
f"{str(Path(__file__).parent.parent)}/"
|
||||
"lib/rocprofiler_compute_tool.cpp "
|
||||
f"{str(Path(__file__).parent.parent)}/"
|
||||
"lib/helper.cpp "
|
||||
f"{native_tool_cpp_path}/"
|
||||
"rocprofiler_compute_tool.cpp "
|
||||
f"{native_tool_cpp_path}/"
|
||||
"helper.cpp "
|
||||
# temporary shared object for native tool
|
||||
f"-o {native_tool_path}"
|
||||
)
|
||||
@@ -575,7 +579,15 @@ class RocProfCompute_Base:
|
||||
success, output = capture_subprocess_output(shlex.split(build_command))
|
||||
console_debug(f"Build output: {output}")
|
||||
if not success:
|
||||
console_error("Failed to build native counter collection tool.")
|
||||
console_error(
|
||||
"Failed to use native counter collection tool.\n"
|
||||
"Could not find pre-built .so file at: "
|
||||
f"{native_tool_base_path / native_tool_glob_pattern}\n"
|
||||
"Could not find source .cpp files in folder: "
|
||||
f"{native_tool_cpp_path}\n"
|
||||
"Please ensure the native tool library is installed "
|
||||
"or source files are present."
|
||||
)
|
||||
|
||||
if self.__profiler == "rocprofiler-sdk":
|
||||
options = self.get_profiler_options(native_tool_path=native_tool_path)
|
||||
|
||||
@@ -92,7 +92,6 @@ class rocprofiler_sdk_profiler(RocProfCompute_Base):
|
||||
/ "librocprofiler-sdk-rocattach.so"
|
||||
)
|
||||
options.update({
|
||||
"ROCPROF_ATTACH_TOOL_LIBRARY": rocprofiler_sdk_tool_path,
|
||||
"ROCPROF_ATTACH_LIBRARY": rocprofiler_attach_library_path,
|
||||
"ROCPROF_ATTACH_PID": args.attach_pid,
|
||||
})
|
||||
|
||||
@@ -45,7 +45,7 @@ from sqlalchemy.sql import Select
|
||||
from utils.logger import console_debug, console_error
|
||||
|
||||
PREFIX = "compute_"
|
||||
SCHEMA_VERSION = "1.1.0"
|
||||
SCHEMA_VERSION = "1.2.0"
|
||||
|
||||
|
||||
Base = declarative_base()
|
||||
@@ -63,18 +63,16 @@ class Workload(Base):
|
||||
|
||||
# Workload can have multiple kernels
|
||||
kernels = relationship("Kernel", back_populates="workload")
|
||||
# Workload can have multiple roofline data points
|
||||
roofline_data_points = relationship("RooflineData", back_populates="workload")
|
||||
# Workload can have multiple pc_sampling values
|
||||
pc_sampling_values = relationship("PCsampling", back_populates="workload")
|
||||
# Workload can have multiple metric definitions
|
||||
metric_definitions = relationship("MetricDefinition", back_populates="workload")
|
||||
|
||||
|
||||
class Metric(Base):
|
||||
__tablename__ = f"{PREFIX}metric"
|
||||
class MetricDefinition(Base):
|
||||
__tablename__ = f"{PREFIX}metric_definition"
|
||||
|
||||
metric_uuid = Column(Integer, primary_key=True)
|
||||
kernel_uuid = Column(
|
||||
Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False
|
||||
workload_id = Column(
|
||||
Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False
|
||||
)
|
||||
name = Column(String) # e.g. Wavefronts Num
|
||||
metric_id = Column(String) # e.g. 4.1.3
|
||||
@@ -83,27 +81,26 @@ class Metric(Base):
|
||||
sub_table_name = Column(String) # e.g. Wavefront stats
|
||||
unit = Column(String) # e.g. Gbps
|
||||
|
||||
# Metric can have one kernel
|
||||
kernel = relationship("Kernel", back_populates="metrics")
|
||||
# Metric can have multiple values
|
||||
values = relationship("Value", back_populates="metric")
|
||||
# Metric can have one workload
|
||||
workload = relationship("Workload", back_populates="metric_definitions")
|
||||
# Metric can have multiple metric values
|
||||
metric_values = relationship("MetricValue", back_populates="metric")
|
||||
|
||||
|
||||
class RooflineData(Base):
|
||||
__tablename__ = f"{PREFIX}roofline_data"
|
||||
|
||||
roofline_uuid = Column(Integer, primary_key=True)
|
||||
workload_id = Column(
|
||||
Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False
|
||||
kernel_uuid = Column(
|
||||
Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False
|
||||
)
|
||||
kernel_name = Column(String)
|
||||
total_flops = Column(Float)
|
||||
l1_cache_data = Column(Float)
|
||||
l2_cache_data = Column(Float)
|
||||
hbm_cache_data = Column(Float)
|
||||
|
||||
# Roofline data point can have one workload
|
||||
workload = relationship("Workload", back_populates="roofline_data_points")
|
||||
# Roofline data point can have one kernel
|
||||
kernel = relationship("Kernel", back_populates="roofline_data_points")
|
||||
|
||||
|
||||
class Dispatch(Base):
|
||||
@@ -135,42 +132,50 @@ class Kernel(Base):
|
||||
workload = relationship("Workload", back_populates="kernels")
|
||||
# Kernel can have multiple dispatches
|
||||
dispatches = relationship("Dispatch", back_populates="kernel")
|
||||
# Kernel can have multiple metrics
|
||||
metrics = relationship("Metric", back_populates="kernel")
|
||||
# Kernel can have multiple metric values
|
||||
metric_values = relationship("MetricValue", back_populates="kernel")
|
||||
# Kernel can have multiple roofline data points
|
||||
roofline_data_points = relationship("RooflineData", back_populates="kernel")
|
||||
# Kernel can have multiple pc_sampling values
|
||||
pc_sampling_values = relationship("PCsampling", back_populates="kernel")
|
||||
|
||||
|
||||
class PCsampling(Base):
|
||||
__tablename__ = f"{PREFIX}pcsampling"
|
||||
|
||||
pc_sampling_uuid = Column(Integer, primary_key=True)
|
||||
workload_id = Column(
|
||||
Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False
|
||||
kernel_uuid = Column(
|
||||
Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False
|
||||
)
|
||||
source = Column(String)
|
||||
instruction = Column(String)
|
||||
count = Column(Integer)
|
||||
kernel_name = Column(String)
|
||||
offset = Column(Integer)
|
||||
count_issue = Column(Integer)
|
||||
count_stall = Column(Integer)
|
||||
stall_reason = Column(JSON)
|
||||
|
||||
# PCsampling can have one workload
|
||||
workload = relationship("Workload", back_populates="pc_sampling_values")
|
||||
# PCsampling can have one kernel
|
||||
kernel = relationship("Kernel", back_populates="pc_sampling_values")
|
||||
|
||||
|
||||
class Value(Base):
|
||||
__tablename__ = f"{PREFIX}value"
|
||||
class MetricValue(Base):
|
||||
__tablename__ = f"{PREFIX}metric_value"
|
||||
|
||||
value_uuid = Column(Integer, primary_key=True)
|
||||
metric_uuid = Column(
|
||||
Integer, ForeignKey(f"{PREFIX}metric.metric_uuid"), nullable=False
|
||||
Integer, ForeignKey(f"{PREFIX}metric_definition.metric_uuid"), nullable=False
|
||||
)
|
||||
kernel_uuid = Column(
|
||||
Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False
|
||||
)
|
||||
value_name = Column(String) # e.g. min, max, avg
|
||||
value = Column(Float) # e.g. 123.45
|
||||
|
||||
# Value can have one metric
|
||||
metric = relationship("Metric", back_populates="values")
|
||||
metric = relationship("MetricDefinition", back_populates="metric_values")
|
||||
# Value can have one kernel
|
||||
kernel = relationship("Kernel", back_populates="metric_values")
|
||||
|
||||
|
||||
class Metadata(Base):
|
||||
@@ -250,11 +255,20 @@ def get_views() -> list[TextClause]:
|
||||
|
||||
views: dict[str, Select[Any]] = {
|
||||
"kernel_view": select(
|
||||
Kernel.kernel_uuid.label("kernel_uuid"),
|
||||
Kernel.workload_id.label("workload_id"),
|
||||
Workload.name.label("workload_name"),
|
||||
Kernel.kernel_name,
|
||||
func.count(Dispatch.dispatch_id).label("dispatch_count"),
|
||||
func.sum(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
|
||||
"duration_ns_sum"
|
||||
),
|
||||
func.min(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
|
||||
"duration_ns_min"
|
||||
),
|
||||
func.max(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
|
||||
"duration_ns_max"
|
||||
),
|
||||
median_calc.c.duration_ns_median,
|
||||
func.avg(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
|
||||
"duration_ns_mean"
|
||||
@@ -262,24 +276,31 @@ def get_views() -> list[TextClause]:
|
||||
)
|
||||
.select_from(Dispatch)
|
||||
.join(Kernel, Dispatch.kernel_uuid == Kernel.kernel_uuid)
|
||||
.join(Workload, Kernel.workload_id == Workload.workload_id)
|
||||
.join(median_calc.subquery(), Kernel.kernel_name == median_calc.c.kernel_name)
|
||||
.group_by(Kernel.kernel_name),
|
||||
.group_by(
|
||||
Kernel.kernel_uuid, Kernel.workload_id, Workload.name, Kernel.kernel_name
|
||||
),
|
||||
"metric_view": select(
|
||||
Workload.workload_id.label("workload_id"),
|
||||
Workload.name.label("workload_name"),
|
||||
Kernel.kernel_uuid.label("kernel_uuid"),
|
||||
Kernel.kernel_name,
|
||||
Metric.name.label("metric_name"),
|
||||
Metric.metric_id,
|
||||
Metric.description,
|
||||
Metric.table_name,
|
||||
Metric.sub_table_name,
|
||||
Metric.unit,
|
||||
Value.value_name,
|
||||
Value.value,
|
||||
MetricDefinition.metric_uuid.label("metric_uuid"),
|
||||
MetricDefinition.name.label("metric_name"),
|
||||
MetricDefinition.metric_id,
|
||||
MetricDefinition.description,
|
||||
MetricDefinition.table_name,
|
||||
MetricDefinition.sub_table_name,
|
||||
MetricDefinition.unit,
|
||||
MetricValue.value_uuid.label("value_uuid"),
|
||||
MetricValue.value_name,
|
||||
MetricValue.value,
|
||||
)
|
||||
.select_from(Metric)
|
||||
.join(Kernel, Metric.kernel_uuid == Kernel.kernel_uuid)
|
||||
.join(Value, Metric.metric_uuid == Value.metric_uuid)
|
||||
.join(Workload, Kernel.workload_id == Workload.workload_id),
|
||||
.select_from(MetricDefinition)
|
||||
.join(Workload, MetricDefinition.workload_id == Workload.workload_id)
|
||||
.join(MetricValue, MetricDefinition.metric_uuid == MetricValue.metric_uuid)
|
||||
.join(Kernel, MetricValue.kernel_uuid == Kernel.kernel_uuid),
|
||||
}
|
||||
|
||||
return [
|
||||
|
||||
@@ -61,7 +61,7 @@ def multi_bar_chart(
|
||||
|
||||
def create_instruction_mix_bar_chart(display_df: pd.DataFrame, df_unit: str) -> px.bar:
|
||||
display_df = display_df.copy()
|
||||
display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "" else 0)
|
||||
display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "N/A" else 0)
|
||||
|
||||
return px.bar(
|
||||
display_df,
|
||||
@@ -78,7 +78,7 @@ def create_multi_bar_charts(
|
||||
display_df: pd.DataFrame, table_id: int, df_unit: str
|
||||
) -> list[px.bar]:
|
||||
display_df = display_df.copy()
|
||||
display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "" else 0)
|
||||
display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "N/A" else 0)
|
||||
|
||||
nested_bar = multi_bar_chart(table_id, display_df)
|
||||
charts = []
|
||||
@@ -103,7 +103,9 @@ def create_multi_bar_charts(
|
||||
|
||||
def create_sol_charts(display_df: pd.DataFrame, table_id: int) -> list[px.bar]:
|
||||
display_df = display_df.copy()
|
||||
display_df["Avg"] = display_df["Avg"].apply(lambda x: float(x) if x != "" else 0.0)
|
||||
display_df["Avg"] = display_df["Avg"].apply(
|
||||
lambda x: float(x) if x != "N/A" else 0.0
|
||||
)
|
||||
|
||||
charts = []
|
||||
|
||||
@@ -144,7 +146,7 @@ def create_sol_charts(display_df: pd.DataFrame, table_id: int) -> list[px.bar]:
|
||||
elif table_id == 1101:
|
||||
# Special formatting reference 'Pct of Peak' value
|
||||
display_df["Pct of Peak"] = display_df["Pct of Peak"].apply(
|
||||
lambda x: float(x) if x != "" else 0.0
|
||||
lambda x: float(x) if x != "N/A" else 0.0
|
||||
)
|
||||
charts.append(
|
||||
px.bar(
|
||||
|
||||
@@ -1290,6 +1290,8 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat
|
||||
# NB: support ignoring the 1st n dispatched execution by '> n'
|
||||
# The better way may be parsing python slice string
|
||||
for dispatch_id in workload.filter_dispatch_ids:
|
||||
if isinstance(dispatch_id, str) and ">" in dispatch_id:
|
||||
dispatch_id = re.match(r"\>\s*(\d+)", dispatch_id).group(1)
|
||||
if int(dispatch_id) >= len(df): # subtract 2 bc of the two header rows
|
||||
console_error("analysis", f"{dispatch_id} is an invalid dispatch id.")
|
||||
|
||||
@@ -1297,7 +1299,7 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat
|
||||
isinstance(workload.filter_dispatch_ids[0], str)
|
||||
and ">" in workload.filter_dispatch_ids[0]
|
||||
):
|
||||
dispatch_match = re.match(r"\> (\d+)", workload.filter_dispatch_ids[0])
|
||||
dispatch_match = re.match(r"\>\s*(\d+)", workload.filter_dispatch_ids[0])
|
||||
df = df[
|
||||
df[schema.PMC_PERF_FILE_PREFIX]["Dispatch_ID"]
|
||||
> int(dispatch_match.group(1))
|
||||
|
||||
@@ -174,15 +174,15 @@ def generate_machine_specs(
|
||||
##########################################
|
||||
machine_info = extract_machine_info()
|
||||
|
||||
# FIXME: use device
|
||||
# Load amd-smi data
|
||||
gpu_info = extract_gpu_info()
|
||||
|
||||
##########################################
|
||||
## B. SoC Specs
|
||||
##########################################
|
||||
soc_info = extract_soc_info()
|
||||
|
||||
# FIXME: use device
|
||||
# Load amd-smi data
|
||||
gpu_info = extract_gpu_info(gpu_arch=soc_info["gpu_arch"])
|
||||
|
||||
# Combine all specifications
|
||||
with amdsmi_ctx():
|
||||
specs = MachineSpecs(
|
||||
@@ -269,7 +269,16 @@ def extract_machine_info() -> dict[str, Any]:
|
||||
|
||||
|
||||
@demarcate
|
||||
def extract_gpu_info() -> dict[str, Any]:
|
||||
def extract_gpu_info(gpu_arch: Optional[str]) -> dict[str, Any]:
|
||||
# Partition is only supported on >= MI 300 series
|
||||
# (gpu_arch should be gfx940 or higher for MI300+)
|
||||
is_partition_supported = False
|
||||
if gpu_arch and gpu_arch.startswith("gfx") and len(gpu_arch) >= 6:
|
||||
try:
|
||||
is_partition_supported = int(gpu_arch[3:6], 16) >= 0x940
|
||||
except ValueError:
|
||||
pass # Invalid hex string, keep is_partition_supported as False
|
||||
|
||||
result: dict[str, Optional[str]] = {
|
||||
"vbios": None,
|
||||
"compute_partition": None,
|
||||
@@ -278,17 +287,22 @@ def extract_gpu_info() -> dict[str, Any]:
|
||||
|
||||
with amdsmi_ctx():
|
||||
result["vbios"] = get_gpu_vbios_part_number()
|
||||
result["compute_partition"] = get_gpu_compute_partition()
|
||||
result["memory_partition"] = get_gpu_memory_partition()
|
||||
if is_partition_supported:
|
||||
result["compute_partition"] = get_gpu_compute_partition()
|
||||
result["memory_partition"] = get_gpu_memory_partition()
|
||||
else:
|
||||
result["compute_partition"] = "N/A"
|
||||
result["memory_partition"] = "N/A"
|
||||
|
||||
# Apply defaults and warnings
|
||||
if result["compute_partition"] == "N/A" or not result["compute_partition"]:
|
||||
console_warning("Cannot detect accelerator partition from amd-smi.")
|
||||
console_warning("Applying default accelerator partition: SPX")
|
||||
result["compute_partition"] = "SPX"
|
||||
if is_partition_supported:
|
||||
if result["compute_partition"] == "N/A" or not result["compute_partition"]:
|
||||
console_warning("Cannot detect accelerator partition from amd-smi.")
|
||||
console_warning("Applying default accelerator partition: SPX")
|
||||
result["compute_partition"] = "SPX"
|
||||
|
||||
if result["memory_partition"] == "N/A" or not result["memory_partition"]:
|
||||
console_warning("Cannot detect memory partition from amd-smi.")
|
||||
if result["memory_partition"] == "N/A" or not result["memory_partition"]:
|
||||
console_warning("Cannot detect memory partition from amd-smi.")
|
||||
|
||||
console_debug(
|
||||
f"vbios is {result['vbios']}, compute partition is "
|
||||
|
||||
@@ -235,6 +235,101 @@ def detect_rocprof(args: argparse.Namespace) -> str:
|
||||
return rocprof_cmd
|
||||
|
||||
|
||||
def perform_attach_detach(new_env: dict[str, str], options: dict[str, Any]) -> None:
|
||||
@contextmanager
|
||||
def temporary_env(env_vars: dict[str, str]) -> Generator[None, None, None]:
|
||||
"""
|
||||
Temporarily change the environment variable of this application.
|
||||
"""
|
||||
original_env = os.environ.copy()
|
||||
os.environ.update({k: str(v) for k, v in env_vars.items()})
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
os.environ.clear()
|
||||
os.environ.update(original_env)
|
||||
|
||||
with temporary_env(new_env):
|
||||
libname = options["ROCPROF_ATTACH_LIBRARY"]
|
||||
|
||||
try:
|
||||
c_lib = ctypes.CDLL(libname)
|
||||
if c_lib is None:
|
||||
console_error(f"Error opening {libname}")
|
||||
except Exception as e:
|
||||
console_error(f"Error loading {libname}: {e}")
|
||||
|
||||
# Set argument and return types for attach/detach functions
|
||||
try:
|
||||
# old attach/detach API
|
||||
c_lib.attach.argtypes = [ctypes.c_uint]
|
||||
except Exception as e:
|
||||
console_debug(
|
||||
"Error setting old attach/detach API argument "
|
||||
f"types: {e}, trying new API"
|
||||
)
|
||||
try:
|
||||
# new attach/detach API
|
||||
c_lib.rocattach_attach.restype = ctypes.c_int
|
||||
c_lib.rocattach_attach.argtypes = [ctypes.c_int]
|
||||
c_lib.rocattach_detach.restype = ctypes.c_int
|
||||
c_lib.rocattach_detach.argtypes = [ctypes.c_int]
|
||||
except Exception as e:
|
||||
console_error(
|
||||
f"Error setting attach/detach function argument types: {e}"
|
||||
)
|
||||
|
||||
pid = options["ROCPROF_ATTACH_PID"]
|
||||
if pid is None:
|
||||
console_error("Mode of attach/detach must have setup for process ID")
|
||||
|
||||
try:
|
||||
# old attach/detach API
|
||||
c_lib.attach(int(pid))
|
||||
except Exception as e:
|
||||
console_debug(f"Error attaching with old API: {e}, trying new API")
|
||||
try:
|
||||
# new attach/detach API
|
||||
attach_status = c_lib.rocattach_attach(int(pid))
|
||||
if attach_status != 0:
|
||||
console_error(
|
||||
f"Error attaching to process {pid}, "
|
||||
f"rocattach_attach returned {attach_status}"
|
||||
)
|
||||
except Exception as e:
|
||||
console_error(f"Error attaching to process {pid}: {e}")
|
||||
|
||||
duration = os.environ.get("ROCPROF_ATTACH_DURATION", None)
|
||||
if duration is None:
|
||||
console_log(
|
||||
f"\033[93mAttach to process with ID {pid} is successful, "
|
||||
"Press Enter to detach...\033[0m"
|
||||
)
|
||||
input()
|
||||
else:
|
||||
console_log(
|
||||
f"\033[93mAttach to process with ID {pid} is successful, "
|
||||
f"detach will happen in {duration} milliseconds...\033[0m"
|
||||
)
|
||||
time.sleep(int(duration) / 1000)
|
||||
|
||||
try:
|
||||
# old attach/detach API
|
||||
c_lib.detach(int(pid))
|
||||
except Exception as e:
|
||||
console_debug(f"Error detaching with old API: {e}, trying new API")
|
||||
try:
|
||||
# new attach/detach API
|
||||
detach_status = c_lib.rocattach_detach(int(pid))
|
||||
if detach_status != 0:
|
||||
console_error(
|
||||
f"Error detaching from process {pid}, "
|
||||
f"rocattach_detach returned {detach_status}"
|
||||
)
|
||||
except Exception as e:
|
||||
console_error(f"Error detaching from process {pid}: {e}")
|
||||
|
||||
|
||||
def capture_subprocess_output(
|
||||
subprocess_args: list[str],
|
||||
new_env: Optional[dict[str, str]] = None,
|
||||
@@ -788,49 +883,7 @@ def run_prof(
|
||||
console_debug(f"rocprof sdk env vars: {new_env}")
|
||||
|
||||
if is_mode_live_attach:
|
||||
|
||||
@contextmanager
|
||||
def temporary_env(env_vars: dict[str, str]) -> Generator[None, None, None]:
|
||||
"""
|
||||
Temporarily change the environment variable of this application.
|
||||
"""
|
||||
original_env = os.environ.copy()
|
||||
os.environ.update({k: str(v) for k, v in env_vars.items()})
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
os.environ.clear()
|
||||
os.environ.update(original_env)
|
||||
|
||||
with temporary_env(new_env):
|
||||
libname = options["ROCPROF_ATTACH_LIBRARY"]
|
||||
c_lib = ctypes.CDLL(libname)
|
||||
if c_lib is None:
|
||||
console_error(f"Error opening {libname}")
|
||||
c_lib.attach.argtypes = [ctypes.c_uint]
|
||||
|
||||
pid = options["ROCPROF_ATTACH_PID"]
|
||||
if pid is None:
|
||||
console_error(
|
||||
"Mode of attach/detach must have setup for process ID"
|
||||
)
|
||||
|
||||
c_lib.attach(int(pid))
|
||||
duration = os.environ.get("ROCPROF_ATTACH_DURATION", None)
|
||||
if duration is None:
|
||||
console_log(
|
||||
f"\033[93mAttach to process with ID {pid} is successful, "
|
||||
"Press Enter to detach...\033[0m"
|
||||
)
|
||||
input()
|
||||
else:
|
||||
console_log(
|
||||
f"\033[93mAttach to process with ID {pid} is successful, "
|
||||
f"detach will happen in {duration} milliseconds...\033[0m"
|
||||
)
|
||||
time.sleep(int(duration) / 1000)
|
||||
c_lib.detach(int(pid))
|
||||
|
||||
perform_attach_detach(new_env, options)
|
||||
else:
|
||||
if app_cmd is None:
|
||||
console_error(
|
||||
|
||||
@@ -29,6 +29,7 @@ import re
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
@@ -989,19 +990,19 @@ def test_analyze_rocpd(
|
||||
Dispatch,
|
||||
Kernel,
|
||||
Metadata,
|
||||
Metric,
|
||||
MetricDefinition,
|
||||
MetricValue,
|
||||
RooflineData,
|
||||
Value,
|
||||
Workload,
|
||||
)
|
||||
|
||||
table_name_map = {
|
||||
"compute_workload": Workload,
|
||||
"compute_metric": Metric,
|
||||
"compute_metric_definition": MetricDefinition,
|
||||
"compute_roofline_data": RooflineData,
|
||||
"compute_dispatch": Dispatch,
|
||||
"compute_kernel": Kernel,
|
||||
"compute_value": Value,
|
||||
"compute_metric_value": MetricValue,
|
||||
"compute_metadata": Metadata,
|
||||
}
|
||||
|
||||
@@ -2268,6 +2269,7 @@ def test_live_attach_detach_block(binary_handler_profile_rocprof_compute):
|
||||
try:
|
||||
# Start workload
|
||||
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
|
||||
time.sleep(5) # Give workload time to start
|
||||
|
||||
attach_detach = {
|
||||
"attach_pid": process_workload.pid,
|
||||
@@ -2316,8 +2318,9 @@ def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_co
|
||||
try:
|
||||
# Start workload with sleep mode enabled
|
||||
process_workload = subprocess.Popen(
|
||||
[config["app_hip_dynamic_shared"], "--enable-sleep"], env=env
|
||||
[*config["app_hip_dynamic_shared"], "--enable-sleep"], env=env
|
||||
)
|
||||
time.sleep(5) # Give workload time to start
|
||||
|
||||
attach_detach = {
|
||||
"attach_pid": process_workload.pid,
|
||||
@@ -2358,7 +2361,7 @@ def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_co
|
||||
|
||||
|
||||
@pytest.mark.live_attach_detach
|
||||
def test_live_attach_detach_singlepath_launch_stats(
|
||||
def test_live_attach_detach_singlepass_launch_stats(
|
||||
binary_handler_profile_rocprof_compute,
|
||||
):
|
||||
options = ["--set", "launch_stats"]
|
||||
@@ -2374,6 +2377,7 @@ def test_live_attach_detach_singlepath_launch_stats(
|
||||
try:
|
||||
# Start workload
|
||||
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
|
||||
time.sleep(5) # Give workload time to start
|
||||
|
||||
attach_detach = {
|
||||
"attach_pid": process_workload.pid,
|
||||
|
||||
@@ -242,6 +242,19 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
|
||||
## ROCprofiler-SDK 1.1.0 for ROCm release 7.2
|
||||
|
||||
### Added
|
||||
- Strix halo support for counter collection.
|
||||
- Counter collection support for `gfx1150` and `gfx1151` (Strix Halo).
|
||||
- HSA Extension API v8 support.
|
||||
- `hipStreamCopyAttributes` API implementation.
|
||||
|
||||
### Optimized
|
||||
|
||||
- Improved process attachment and updated the corresponding [documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3-process-attachment.html).
|
||||
- Improved [Quick reference guide for rocprofv3] (https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/quick_guide.html).
|
||||
- Updated installation documentation with links to the latest repository (https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/install/installation.html).
|
||||
|
||||
### Resolved issues
|
||||
- Fixed multi-GPU dimension mismatch.
|
||||
- Fixed device lock issue for dispatch counters.
|
||||
- Addressed OpenMP Tools task scheduling null pointer exception.
|
||||
- Fixed stream ID errors arising during process attachment.
|
||||
- Fixed issues arising during dynamic code object loading.
|
||||
|
||||
@@ -1004,6 +1004,11 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipGetProcAddress_spt)
|
||||
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 20
|
||||
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo)
|
||||
#endif
|
||||
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
|
||||
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging)
|
||||
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging)
|
||||
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams)
|
||||
#endif
|
||||
#if HIP_RUNTIME_API_TABLE_STEP_VERSION == 0
|
||||
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 442);
|
||||
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 1
|
||||
@@ -1046,6 +1051,8 @@ static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 506);
|
||||
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 507);
|
||||
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
|
||||
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 508);
|
||||
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
|
||||
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 511);
|
||||
#else
|
||||
# if !defined(ROCPROFILER_UNSAFE_NO_VERSION_CHECK) && \
|
||||
(defined(ROCPROFILER_CI) && ROCPROFILER_CI > 0)
|
||||
|
||||
@@ -3377,6 +3377,26 @@ typedef union rocprofiler_hip_api_args_t
|
||||
size_t* paramSize;
|
||||
} hipKernelGetParamInfo;
|
||||
#endif
|
||||
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
|
||||
struct
|
||||
{
|
||||
// Empty struct has a size of 0 in C but size of 1 in C++.
|
||||
// Add the rocprofiler_hip_api_no_args struct to fix this
|
||||
rocprofiler_hip_api_no_args no_args;
|
||||
} hipExtDisableLogging;
|
||||
struct
|
||||
{
|
||||
// Empty struct has a size of 0 in C but size of 1 in C++.
|
||||
// Add the rocprofiler_hip_api_no_args struct to fix this
|
||||
rocprofiler_hip_api_no_args no_args;
|
||||
} hipExtEnableLogging;
|
||||
struct
|
||||
{
|
||||
size_t log_level;
|
||||
size_t log_size;
|
||||
size_t log_mask;
|
||||
} hipExtSetLoggingParams;
|
||||
#endif
|
||||
} rocprofiler_hip_api_args_t;
|
||||
|
||||
ROCPROFILER_EXTERN_C_FINI
|
||||
|
||||
@@ -575,6 +575,11 @@ typedef enum rocprofiler_hip_runtime_api_id_t // NOLINT(performance-enum-size)
|
||||
#endif
|
||||
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 20
|
||||
ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo,
|
||||
#endif
|
||||
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
|
||||
ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging,
|
||||
ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging,
|
||||
ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams,
|
||||
#endif
|
||||
ROCPROFILER_HIP_RUNTIME_API_ID_LAST,
|
||||
} rocprofiler_hip_runtime_api_id_t;
|
||||
|
||||
@@ -623,6 +623,12 @@ ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipGetProcAddress_spt_fn, 506);
|
||||
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipKernelGetParamInfo_fn, 507);
|
||||
#endif
|
||||
|
||||
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
|
||||
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtDisableLogging_fn, 508);
|
||||
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtEnableLogging_fn, 509);
|
||||
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtSetLoggingParams_fn, 510);
|
||||
#endif
|
||||
|
||||
#if HIP_RUNTIME_API_TABLE_STEP_VERSION == 0
|
||||
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 442)
|
||||
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 1
|
||||
@@ -665,6 +671,8 @@ ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 506)
|
||||
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 507)
|
||||
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
|
||||
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 508)
|
||||
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
|
||||
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 511)
|
||||
#else
|
||||
INTERNAL_CI_ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 0)
|
||||
#endif
|
||||
|
||||
@@ -650,6 +650,11 @@ HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNT
|
||||
HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo, hipKernelGetParamInfo, hipKernelGetParamInfo_fn, kernel, paramIndex, paramOffset, paramSize);
|
||||
#endif
|
||||
|
||||
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
|
||||
HIP_API_INFO_DEFINITION_0(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging, hipExtDisableLogging, hipExtDisableLogging_fn);
|
||||
HIP_API_INFO_DEFINITION_0(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging, hipExtEnableLogging, hipExtEnableLogging_fn);
|
||||
HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams, hipExtSetLoggingParams, hipExtSetLoggingParams_fn, log_level, log_size, log_mask);
|
||||
#endif
|
||||
// clang-format on
|
||||
|
||||
#else
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
#
|
||||
# function for
|
||||
#
|
||||
@@ -142,11 +145,27 @@ function(rocprofiler_systems_causal_example_executable _NAME)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(
|
||||
TARGETS ${_NAME} ${_NAME}-rocprofsys ${_NAME}-coz
|
||||
DESTINATION bin
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
OPTIONAL
|
||||
set(_TARGETS
|
||||
${_NAME}
|
||||
${_NAME}-rocprofsys
|
||||
${_NAME}-ndebug
|
||||
${_NAME}-rocprofsys-ndebug
|
||||
${_NAME}-coz
|
||||
)
|
||||
set(_EXISTING_TARGETS)
|
||||
|
||||
foreach(_TARGET IN LISTS _TARGETS)
|
||||
if(TARGET ${_TARGET})
|
||||
list(APPEND _EXISTING_TARGETS ${_TARGET})
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
if(_EXISTING_TARGETS)
|
||||
install(
|
||||
TARGETS ${_EXISTING_TARGETS}
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-code-coverage-example LANGUAGES CXX)
|
||||
@@ -22,7 +25,11 @@ target_link_libraries(code-coverage PRIVATE Threads::Threads)
|
||||
target_compile_options(code-coverage PRIVATE ${_FLAGS})
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(TARGETS code-coverage DESTINATION bin COMPONENT rocprofiler-systems-examples)
|
||||
install(
|
||||
TARGETS code-coverage
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
set(PYTHON_FILES code-coverage.py)
|
||||
@@ -42,7 +49,7 @@ if(Python3_FOUND)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(
|
||||
PROGRAMS ${PROJECT_BINARY_DIR}/${_FILE}
|
||||
DESTINATION bin
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-fork LANGUAGES CXX)
|
||||
@@ -18,7 +21,13 @@ target_link_libraries(
|
||||
target_compile_options(fork-example PRIVATE ${_FLAGS})
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(TARGETS fork-example DESTINATION bin COMPONENT rocprofiler-systems-examples)
|
||||
if(TARGET fork-example)
|
||||
install(
|
||||
TARGETS fork-example
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# HIP fork example (multi-process concurrency test)
|
||||
@@ -81,10 +90,10 @@ if(HIPCC_EXECUTABLE)
|
||||
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET hipMallocConcurrencyMproc)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET hipMallocConcurrencyMproc)
|
||||
install(
|
||||
TARGETS hipMallocConcurrencyMproc
|
||||
DESTINATION bin
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -1,25 +1,5 @@
|
||||
################################################################################
|
||||
# Copyright (c) 2024 - 2025 Advanced Micro Devices, Inc.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in all
|
||||
# copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
# SOFTWARE.
|
||||
#
|
||||
################################################################################
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
@@ -157,11 +137,15 @@ if(HIP_FOUND AND rocjpeg_FOUND AND Threads_FOUND AND rocprofiler-register_FOUND)
|
||||
target_compile_options(jpegdecode PRIVATE ${_FLAGS})
|
||||
copy_image_files_and_make_copies()
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(TARGETS jpegdecode DESTINATION bin COMPONENT rocprofiler-systems-examples)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET jpegdecode)
|
||||
install(
|
||||
FILES ${CMAKE_BINARY_DIR}/images
|
||||
DESTINATION share/rocprofiler-systems/tests/images
|
||||
TARGETS jpegdecode
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
install(
|
||||
DIRECTORY ${CMAKE_BINARY_DIR}/images/
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/images
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-lulesh-example LANGUAGES C CXX)
|
||||
@@ -82,16 +85,11 @@ rocprofiler_systems_causal_example_executable(
|
||||
INCLUDE_DIRECTORIES ${PROJECT_SOURCE_DIR}/includes
|
||||
)
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
if(LULESH_BUILD_KOKKOS)
|
||||
install(
|
||||
TARGETS kokkoscore kokkoscontainers
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND LULESH_BUILD_KOKKOS)
|
||||
install(
|
||||
TARGETS kokkoscore kokkoscontainers kokkossimd
|
||||
LIBRARY
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/lib
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
set_target_properties(
|
||||
lulesh
|
||||
PROPERTIES INSTALL_RPATH "\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}"
|
||||
)
|
||||
endif()
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-mpi-examples LANGUAGES C CXX)
|
||||
@@ -77,16 +80,24 @@ add_executable(mpi-example mpi.cpp)
|
||||
target_link_libraries(mpi-example PRIVATE mpi-cxx-interface-library)
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(
|
||||
TARGETS
|
||||
mpi-example
|
||||
mpi-allgather
|
||||
mpi-bcast
|
||||
mpi-all2all
|
||||
mpi-reduce
|
||||
mpi-scatter-gather
|
||||
mpi-send-recv
|
||||
DESTINATION bin
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
set(MPI_EXAMPLES
|
||||
mpi-example
|
||||
mpi-allgather
|
||||
mpi-bcast
|
||||
mpi-all2all
|
||||
mpi-reduce
|
||||
mpi-scatter-gather
|
||||
mpi-send-recv
|
||||
mpi-allreduce
|
||||
)
|
||||
|
||||
foreach(MPI_EXAMPLE IN LISTS MPI_EXAMPLES)
|
||||
if(TARGET ${MPI_EXAMPLE})
|
||||
install(
|
||||
TARGETS ${MPI_EXAMPLE}
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
@@ -89,13 +89,17 @@ endif()
|
||||
target_link_libraries(openmp-cg PRIVATE openmp-common)
|
||||
target_link_libraries(openmp-lu PRIVATE openmp-common)
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(
|
||||
TARGETS openmp-cg openmp-lu
|
||||
DESTINATION bin
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
set(OPENMP_EXAMPLES openmp-cg openmp-lu)
|
||||
|
||||
foreach(OPENMP_EXAMPLE IN LISTS OPENMP_EXAMPLES)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET ${OPENMP_EXAMPLE})
|
||||
install(
|
||||
TARGETS ${OPENMP_EXAMPLE}
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
set(DEFAULT_GPU_TARGETS
|
||||
"gfx900"
|
||||
|
||||
@@ -280,3 +280,19 @@ set(ROCPROFSYS_OMPVV_OFFLOAD_TESTS
|
||||
rocprofiler_systems_message(STATUS
|
||||
"Successfully configured OMPVV"
|
||||
)
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
foreach(
|
||||
test_target
|
||||
IN
|
||||
LISTS ROCPROFSYS_OMPVV_HOST_TESTS ROCPROFSYS_OMPVV_OFFLOAD_TESTS
|
||||
)
|
||||
if(TARGET "${test_target}-build")
|
||||
install(
|
||||
PROGRAMS "${OMPVV_BIN_DEST}/${test_target}"
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
@@ -71,7 +71,7 @@ endif()
|
||||
# Use the directory that actually contains the library we found
|
||||
get_filename_component(_rocm_llvm_lib "${LIBOMPTARGET_SO}" DIRECTORY)
|
||||
set(_rocm_clang_lib "${ROCM_ROOT_DIR}/lib")
|
||||
set(_COMMON_RPATH "${_rocm_llvm_lib};${_rocm_clang_lib};$ORIGIN")
|
||||
set(_COMMON_RPATH "${_rocm_llvm_lib};${_rocm_clang_lib};$ORIGIN;$ORIGIN/lib")
|
||||
if(ROCmVersion_DIR)
|
||||
list(APPEND _COMMON_RPATH "${ROCmVersion_DIR}/llvm/lib")
|
||||
endif()
|
||||
@@ -126,3 +126,15 @@ rocprofiler_systems_custom_compilation(
|
||||
rocprofiler_systems_custom_compilation(TARGET openmp-target
|
||||
COMPILER ${OMP_TARGET_COMPILER}
|
||||
)
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
if(TARGET openmp-target AND TARGET openmp-target-lib)
|
||||
install(
|
||||
TARGETS openmp-target openmp-target-lib
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
LIBRARY
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/lib
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-parallel-overhead-example LANGUAGES CXX)
|
||||
@@ -32,10 +35,13 @@ target_link_libraries(
|
||||
)
|
||||
target_compile_definitions(parallel-overhead-locks PRIVATE USE_LOCKS=1)
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(
|
||||
TARGETS parallel-overhead parallel-overhead-locks
|
||||
DESTINATION bin
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
set(PARALLEL_OVERHEAD_EXAMPLES parallel-overhead parallel-overhead-locks)
|
||||
foreach(PARALLEL_OVERHEAD_EXAMPLE IN LISTS PARALLEL_OVERHEAD_EXAMPLES)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET ${PARALLEL_OVERHEAD_EXAMPLE})
|
||||
install(
|
||||
TARGETS ${PARALLEL_OVERHEAD_EXAMPLE}
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-python)
|
||||
@@ -36,7 +39,7 @@ if(Python3_FOUND)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(
|
||||
PROGRAMS ${PROJECT_BINARY_DIR}/${_FILE}
|
||||
DESTINATION bin
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -138,6 +138,14 @@ if(hip_FOUND AND rccl_FOUND)
|
||||
add_dependencies(rccl-tests::${_EXE_NAME} copy-${_EXE_NAME})
|
||||
|
||||
list(APPEND _RCCL_TEST_TARGETS "rccl-tests::${_EXE_NAME}")
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(
|
||||
PROGRAMS ${_EXE_DEST_PATH}
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
set(RCCL_TEST_TARGETS "${_RCCL_TEST_TARGETS}" CACHE INTERNAL "rccl-test targets")
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-rewrite-caller-example LANGUAGES CXX)
|
||||
@@ -18,6 +21,10 @@ set(CMAKE_BUILD_TYPE "Debug")
|
||||
add_executable(rewrite-caller rewrite-caller.cpp)
|
||||
target_compile_options(rewrite-caller PRIVATE ${_FLAGS})
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(TARGETS rewrite-caller DESTINATION bin COMPONENT rocprofiler-systems-examples)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET rewrite-caller)
|
||||
install(
|
||||
TARGETS rewrite-caller
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -1,24 +1,5 @@
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
@@ -113,6 +94,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
|
||||
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET roctx)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(TARGETS roctx DESTINATION bin COMPONENT rocprofiler-systems-examples)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET roctx)
|
||||
install(
|
||||
TARGETS roctx
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -40,3 +40,11 @@ target_compile_options(tests-compile-options INTERFACE -g)
|
||||
add_executable(thread-limit thread-limit.cpp)
|
||||
target_compile_definitions(thread-limit PRIVATE MAX_THREADS=${ROCPROFSYS_MAX_THREADS})
|
||||
target_link_libraries(thread-limit PRIVATE Threads::Threads tests-compile-options)
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET thread-limit)
|
||||
install(
|
||||
TARGETS thread-limit
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-trace-time-window-example LANGUAGES CXX)
|
||||
@@ -18,10 +21,10 @@ set(CMAKE_BUILD_TYPE "Debug")
|
||||
add_executable(trace-time-window trace-time-window.cpp)
|
||||
target_compile_options(trace-time-window PRIVATE ${_FLAGS})
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET trace-time-window)
|
||||
install(
|
||||
TARGETS trace-time-window
|
||||
DESTINATION bin
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-transferBench-example LANGUAGES CXX)
|
||||
@@ -120,6 +123,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
|
||||
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transferBench)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(TARGETS transferBench DESTINATION bin COMPONENT rocprofiler-systems-examples)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET transferBench)
|
||||
install(
|
||||
TARGETS transferBench
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-transpose-example LANGUAGES CXX)
|
||||
@@ -93,6 +96,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
|
||||
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transpose)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(TARGETS transpose DESTINATION bin COMPONENT rocprofiler-systems-examples)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET transpose)
|
||||
install(
|
||||
TARGETS transpose
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-user-api-example LANGUAGES CXX)
|
||||
@@ -24,6 +27,10 @@ target_link_libraries(
|
||||
PRIVATE Threads::Threads rocprofiler-systems::rocprofiler-systems-user-library
|
||||
)
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(TARGETS user-api DESTINATION bin COMPONENT rocprofiler-systems-examples)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET user-api)
|
||||
install(
|
||||
TARGETS user-api
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-systems-videodecode-example LANGUAGES CXX)
|
||||
@@ -176,15 +179,15 @@ if(FFMPEG_FOUND AND rocdecode_FOUND)
|
||||
target_compile_definitions(videodecode PUBLIC USE_AVCODEC_GREATER_THAN_58_134=1)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET videodecode)
|
||||
install(
|
||||
TARGETS videodecode
|
||||
DESTINATION bin
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
install(
|
||||
FILES ${CMAKE_BINARY_DIR}/videos
|
||||
DESTINATION share/rocprofiler-systems/tests/videos
|
||||
DIRECTORY ${CMAKE_BINARY_DIR}/videos/
|
||||
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/videos
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -192,6 +192,10 @@ prepare_environment_for_run(parser_data_t& _data)
|
||||
rocprofsys::argparse::add_ld_preload(_data);
|
||||
rocprofsys::argparse::add_ld_library_path(_data);
|
||||
}
|
||||
|
||||
rocprofsys::argparse::add_torch_library_path(_data, _data.verbose > 0);
|
||||
|
||||
rocprofsys::common::consolidate_env_entries(_data.current);
|
||||
}
|
||||
|
||||
void
|
||||
|
||||
@@ -933,3 +933,9 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
|
||||
|
||||
return _outv;
|
||||
}
|
||||
|
||||
void
|
||||
add_torch_library_path(std::vector<char*>& envp, const std::vector<char*>& argv)
|
||||
{
|
||||
rocprofsys::common::add_torch_library_path(envp, argv, verbose > 0, updated_envs);
|
||||
}
|
||||
|
||||
@@ -51,6 +51,8 @@ main(int argc, char** argv)
|
||||
_argv.emplace_back(argv[i]);
|
||||
}
|
||||
|
||||
add_torch_library_path(_env, _argv);
|
||||
|
||||
print_updated_environment(_env);
|
||||
|
||||
if(!_argv.empty())
|
||||
|
||||
@@ -35,3 +35,6 @@ get_initial_environment();
|
||||
|
||||
std::vector<char*>
|
||||
parse_args(int argc, char** argv, std::vector<char*>& envp);
|
||||
|
||||
void
|
||||
add_torch_library_path(std::vector<char*>& envp, const std::vector<char*>& argv);
|
||||
|
||||
@@ -26,9 +26,12 @@
|
||||
|
||||
#include "common/join.hpp"
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <numeric>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
@@ -197,7 +200,7 @@ remove_env(std::vector<char*>& _environ, std::string_view _env_var,
|
||||
{
|
||||
if(match(itr))
|
||||
{
|
||||
free(itr);
|
||||
std::free(itr);
|
||||
itr = nullptr;
|
||||
}
|
||||
}
|
||||
@@ -266,6 +269,113 @@ discover_llvm_libdir_for_ompt(bool verbose = false)
|
||||
return {};
|
||||
}
|
||||
|
||||
inline bool
|
||||
is_python_interpreter(std::string_view executable)
|
||||
{
|
||||
if(executable.empty()) return false;
|
||||
|
||||
const auto slash_pos = executable.rfind('/');
|
||||
const auto basename = (slash_pos != std::string_view::npos)
|
||||
? executable.substr(slash_pos + 1)
|
||||
: executable;
|
||||
|
||||
if(basename == "python" || basename == "python3") return true;
|
||||
|
||||
constexpr std::string_view python3_prefix = "python3.";
|
||||
|
||||
const bool has_valid_prefix =
|
||||
basename.size() > python3_prefix.size() &&
|
||||
basename.substr(0, python3_prefix.size()) == python3_prefix;
|
||||
if(!has_valid_prefix) return false;
|
||||
|
||||
const auto version_digits = basename.substr(python3_prefix.size());
|
||||
|
||||
return std::all_of(version_digits.begin(), version_digits.end(),
|
||||
[](unsigned char c) { return std::isdigit(c); });
|
||||
}
|
||||
|
||||
inline std::string
|
||||
discover_torch_libpath(const std::string& python_binary, bool verbose = false)
|
||||
{
|
||||
if(python_binary.empty()) return {};
|
||||
|
||||
const auto is_safe_executable_path = [](const std::string& path) {
|
||||
// Allow only a conservative set of characters in the executable path to
|
||||
// avoid injection when used in a shell command.
|
||||
for(unsigned char c : path)
|
||||
{
|
||||
if(std::isalnum(c) != 0) continue;
|
||||
switch(c)
|
||||
{
|
||||
case '/':
|
||||
case '.':
|
||||
case '_':
|
||||
case '-':
|
||||
case '+': break;
|
||||
default: return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
if(!is_safe_executable_path(python_binary))
|
||||
{
|
||||
ROCPROFSYS_ENVIRON_LOG(
|
||||
verbose, "Unsafe characters detected in Python interpreter path: %s\n",
|
||||
python_binary.c_str());
|
||||
return {};
|
||||
}
|
||||
|
||||
const auto cmd = "\"" + python_binary +
|
||||
"\" -c \"import torch; print(torch.__path__[0])\" 2>/dev/null";
|
||||
|
||||
FILE* pipe = popen(cmd.c_str(), "r");
|
||||
if(!pipe)
|
||||
{
|
||||
ROCPROFSYS_ENVIRON_LOG(verbose, "Failed to execute command: %s\n", cmd.c_str());
|
||||
return {};
|
||||
}
|
||||
|
||||
char buffer[1024];
|
||||
std::string result;
|
||||
while(fgets(buffer, sizeof(buffer), pipe))
|
||||
{
|
||||
result.append(buffer);
|
||||
// stop if we've read the full line (torch path is printed on a single line)
|
||||
if(!result.empty() && result.back() == '\n') break;
|
||||
}
|
||||
|
||||
int status = pclose(pipe);
|
||||
|
||||
if(status != 0 || result.empty())
|
||||
{
|
||||
ROCPROFSYS_ENVIRON_LOG(verbose, "torch not found for Python interpreter: %s\n",
|
||||
python_binary.c_str());
|
||||
return {};
|
||||
}
|
||||
|
||||
while(!result.empty() &&
|
||||
(result.back() == '\n' || result.back() == '\r' || result.back() == ' '))
|
||||
{
|
||||
result.pop_back();
|
||||
}
|
||||
|
||||
if(result.empty()) return {};
|
||||
|
||||
std::string torch_libdir = result + "/lib";
|
||||
|
||||
if(!::tim::filepath::direxists(torch_libdir))
|
||||
{
|
||||
ROCPROFSYS_ENVIRON_LOG(verbose, "torch lib directory does not exist: %s\n",
|
||||
torch_libdir.c_str());
|
||||
return {};
|
||||
}
|
||||
|
||||
ROCPROFSYS_ENVIRON_LOG(verbose, "Discovered torch library path: %s\n",
|
||||
torch_libdir.c_str());
|
||||
return torch_libdir;
|
||||
}
|
||||
|
||||
enum class update_mode : uint8_t
|
||||
{
|
||||
REPLACE = 0,
|
||||
@@ -335,7 +445,7 @@ update_env(std::vector<char*>& _environ, std::string_view _env_var, Tp&& _env_va
|
||||
}
|
||||
else
|
||||
{
|
||||
free(itr);
|
||||
std::free(itr);
|
||||
itr = strdup(join('=', _env_var, _env_val_str).c_str());
|
||||
}
|
||||
return;
|
||||
@@ -343,5 +453,145 @@ update_env(std::vector<char*>& _environ, std::string_view _env_var, Tp&& _env_va
|
||||
_environ.emplace_back(strdup(join('=', _env_var, _env_val_str).c_str()));
|
||||
}
|
||||
|
||||
template <typename UpdatedEnvsT>
|
||||
inline void
|
||||
add_torch_library_path(std::vector<char*>& envp, const std::vector<char*>& argv,
|
||||
bool verbose, UpdatedEnvsT& updated_envs)
|
||||
{
|
||||
if(argv.empty() || argv.front() == nullptr) return;
|
||||
if(!is_python_interpreter(argv.front())) return;
|
||||
|
||||
auto torch_libpath = discover_torch_libpath(argv.front(), verbose);
|
||||
if(torch_libpath.empty()) return;
|
||||
|
||||
std::unordered_set<std::string> seen{ torch_libpath };
|
||||
std::string result = torch_libpath;
|
||||
|
||||
constexpr std::string_view ld_prefix = "LD_LIBRARY_PATH=";
|
||||
|
||||
auto is_ld_path = [&](char* entry) {
|
||||
return entry &&
|
||||
std::string_view{ entry }.substr(0, ld_prefix.length()) == ld_prefix;
|
||||
};
|
||||
|
||||
for(auto& entry : envp)
|
||||
{
|
||||
if(!is_ld_path(entry)) continue;
|
||||
|
||||
std::istringstream stream{ std::string{ entry + ld_prefix.length() } };
|
||||
for(std::string path; std::getline(stream, path, ':');)
|
||||
{
|
||||
if(!path.empty() && seen.insert(path).second) result += ":" + path;
|
||||
}
|
||||
|
||||
std::free(entry);
|
||||
entry = nullptr;
|
||||
}
|
||||
|
||||
envp.erase(std::remove(envp.begin(), envp.end(), nullptr), envp.end());
|
||||
envp.emplace_back(strdup(join("", ld_prefix, result).c_str()));
|
||||
|
||||
updated_envs.emplace(ld_prefix.substr(0, ld_prefix.length() - 1));
|
||||
}
|
||||
|
||||
inline void
|
||||
consolidate_env_entries(std::vector<char*>& envp)
|
||||
{
|
||||
constexpr char delim = ':';
|
||||
|
||||
struct key_data
|
||||
{
|
||||
std::vector<std::string> parts;
|
||||
std::unordered_set<std::string> seen;
|
||||
|
||||
void add_unique(std::string part)
|
||||
{
|
||||
if(!part.empty() && seen.insert(part).second)
|
||||
parts.emplace_back(std::move(part));
|
||||
}
|
||||
};
|
||||
|
||||
auto parse_entry = [](std::string_view entry)
|
||||
-> std::optional<std::pair<std::string_view, std::string_view>> {
|
||||
auto eq_pos = entry.find('=');
|
||||
if(eq_pos == std::string_view::npos) return std::nullopt;
|
||||
return std::make_pair(entry.substr(0, eq_pos), entry.substr(eq_pos + 1));
|
||||
};
|
||||
|
||||
auto join_parts = [delim](std::string_view key,
|
||||
const std::vector<std::string>& parts) {
|
||||
std::string result;
|
||||
|
||||
const auto total_parts_length = std::accumulate(
|
||||
parts.begin(), parts.end(), std::size_t{ 0 },
|
||||
[](std::size_t acc, const std::string& part) { return acc + part.size(); });
|
||||
|
||||
const auto delim_count = parts.size() - 1;
|
||||
const auto equal_sign_length = 1;
|
||||
|
||||
result.reserve(key.size() + equal_sign_length + total_parts_length + delim_count);
|
||||
result.append(key);
|
||||
result += '=';
|
||||
|
||||
result =
|
||||
std::accumulate(parts.begin(), parts.end(), std::move(result),
|
||||
[delim, &parts](std::string acc, const std::string& part) {
|
||||
if(part != parts.front()) acc += delim;
|
||||
acc.append(part);
|
||||
return acc;
|
||||
});
|
||||
|
||||
return result;
|
||||
};
|
||||
|
||||
std::unordered_map<std::string_view, key_data> key_map;
|
||||
std::vector<std::string_view> key_order;
|
||||
|
||||
for(auto* entry : envp)
|
||||
{
|
||||
if(!entry)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
auto parsed = parse_entry(entry);
|
||||
if(!parsed)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
auto [key, value] = *parsed;
|
||||
|
||||
auto [it, inserted] = key_map.try_emplace(key);
|
||||
if(inserted)
|
||||
{
|
||||
key_order.emplace_back(key);
|
||||
}
|
||||
|
||||
auto& data = it->second;
|
||||
std::istringstream stream{ std::string{ value } };
|
||||
for(std::string part; std::getline(stream, part, delim);)
|
||||
{
|
||||
data.add_unique(part);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<char*> result;
|
||||
result.reserve(key_order.size());
|
||||
|
||||
for(auto key : key_order)
|
||||
{
|
||||
result.emplace_back(strdup(join_parts(key, key_map[key].parts).c_str()));
|
||||
}
|
||||
|
||||
for(auto* entry : envp)
|
||||
{
|
||||
std::free(entry);
|
||||
entry = nullptr;
|
||||
}
|
||||
|
||||
envp = std::move(result);
|
||||
}
|
||||
|
||||
} // namespace common
|
||||
} // namespace rocprofsys
|
||||
|
||||
@@ -24,6 +24,7 @@ add_library(
|
||||
lib-common-tests
|
||||
OBJECT
|
||||
test_discover_llvm_libdir.cpp
|
||||
test_environment.cpp
|
||||
test_path.cpp
|
||||
test_remove_env.cpp
|
||||
test_update_env.cpp
|
||||
|
||||
@@ -0,0 +1,146 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "common/environment.hpp"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
using namespace rocprofsys::common;
|
||||
|
||||
class IsPythonInterpreterTest : public ::testing::Test
|
||||
{};
|
||||
|
||||
TEST_F(IsPythonInterpreterTest, RecognizesPython)
|
||||
{
|
||||
EXPECT_TRUE(is_python_interpreter("python"));
|
||||
EXPECT_TRUE(is_python_interpreter("python3"));
|
||||
EXPECT_TRUE(is_python_interpreter("python3.8"));
|
||||
EXPECT_TRUE(is_python_interpreter("python3.9"));
|
||||
EXPECT_TRUE(is_python_interpreter("python3.10"));
|
||||
EXPECT_TRUE(is_python_interpreter("python3.11"));
|
||||
EXPECT_TRUE(is_python_interpreter("python3.12"));
|
||||
EXPECT_TRUE(is_python_interpreter("/usr/bin/python"));
|
||||
EXPECT_TRUE(is_python_interpreter("/usr/bin/python3"));
|
||||
EXPECT_TRUE(is_python_interpreter("/usr/bin/python3.10"));
|
||||
EXPECT_TRUE(is_python_interpreter("/home/user/venv/bin/python"));
|
||||
EXPECT_TRUE(is_python_interpreter("/opt/conda/bin/python3.11"));
|
||||
EXPECT_FALSE(is_python_interpreter("bash"));
|
||||
EXPECT_FALSE(is_python_interpreter("sh"));
|
||||
EXPECT_FALSE(is_python_interpreter("ruby"));
|
||||
EXPECT_FALSE(is_python_interpreter("node"));
|
||||
EXPECT_FALSE(is_python_interpreter("java"));
|
||||
EXPECT_FALSE(is_python_interpreter("/usr/bin/bash"));
|
||||
EXPECT_FALSE(is_python_interpreter("./my_app"));
|
||||
EXPECT_FALSE(is_python_interpreter("pythonista"));
|
||||
EXPECT_FALSE(is_python_interpreter("python_script.py"));
|
||||
EXPECT_FALSE(is_python_interpreter("mypython"));
|
||||
EXPECT_FALSE(is_python_interpreter("python2"));
|
||||
EXPECT_FALSE(is_python_interpreter("python3."));
|
||||
EXPECT_FALSE(is_python_interpreter("python3.a"));
|
||||
EXPECT_FALSE(is_python_interpreter("python3.10a"));
|
||||
EXPECT_FALSE(is_python_interpreter("python3x10"));
|
||||
EXPECT_FALSE(is_python_interpreter(""));
|
||||
EXPECT_FALSE(is_python_interpreter("/usr/bin/"));
|
||||
}
|
||||
|
||||
class DuplicatedEnvironmentEntriesTest : public ::testing::Test
|
||||
{};
|
||||
|
||||
TEST_F(DuplicatedEnvironmentEntriesTest, DuplicateEnvironmentEntries)
|
||||
{
|
||||
std::vector<char*> env_vars = {
|
||||
strdup("PATH=/usr/local/bin:/usr/bin:/bin:/usr/local/bin2"),
|
||||
strdup("PATH=/usr/local/bin:/usr/bin:/bin"),
|
||||
};
|
||||
|
||||
consolidate_env_entries(env_vars);
|
||||
|
||||
ASSERT_EQ(env_vars.size(), 1);
|
||||
EXPECT_STREQ(env_vars[0], "PATH=/usr/local/bin:/usr/bin:/bin:/usr/local/bin2");
|
||||
|
||||
for(auto* entry : env_vars)
|
||||
free(entry);
|
||||
}
|
||||
|
||||
TEST_F(DuplicatedEnvironmentEntriesTest, HandlesEmptyVector)
|
||||
{
|
||||
std::vector<char*> env_vars;
|
||||
consolidate_env_entries(env_vars);
|
||||
EXPECT_TRUE(env_vars.empty());
|
||||
}
|
||||
|
||||
TEST_F(DuplicatedEnvironmentEntriesTest, HandlesNullEntries)
|
||||
{
|
||||
std::vector<char*> env_vars = {
|
||||
strdup("PATH=/usr/bin"),
|
||||
nullptr,
|
||||
strdup("PATH=/bin"),
|
||||
};
|
||||
consolidate_env_entries(env_vars);
|
||||
ASSERT_EQ(env_vars.size(), 1);
|
||||
EXPECT_STREQ(env_vars[0], "PATH=/usr/bin:/bin");
|
||||
for(auto* entry : env_vars)
|
||||
std::free(entry);
|
||||
}
|
||||
|
||||
TEST_F(DuplicatedEnvironmentEntriesTest, HandlesEmptyValues)
|
||||
{
|
||||
std::vector<char*> env_vars = {
|
||||
strdup("EMPTY_VAR="),
|
||||
strdup("PATH=/usr/bin"),
|
||||
};
|
||||
consolidate_env_entries(env_vars);
|
||||
ASSERT_EQ(env_vars.size(), 2);
|
||||
|
||||
for(auto* entry : env_vars)
|
||||
std::free(entry);
|
||||
}
|
||||
|
||||
class AddTorchLibraryPathTest : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
std::unordered_set<std::string> updated_envs;
|
||||
};
|
||||
|
||||
TEST_F(AddTorchLibraryPathTest, SkipsNonPythonExecutables)
|
||||
{
|
||||
std::vector<char*> envp = {
|
||||
strdup("LD_LIBRARY_PATH=/usr/lib"),
|
||||
};
|
||||
std::vector<char*> argv = {
|
||||
strdup("/usr/bin/bash"),
|
||||
};
|
||||
add_torch_library_path(envp, argv, false, updated_envs);
|
||||
// Should not modify environment
|
||||
ASSERT_EQ(envp.size(), 1);
|
||||
EXPECT_STREQ(envp[0], "LD_LIBRARY_PATH=/usr/lib");
|
||||
for(auto* entry : envp)
|
||||
std::free(entry);
|
||||
for(auto* entry : argv)
|
||||
std::free(entry);
|
||||
}
|
||||
|
||||
TEST_F(AddTorchLibraryPathTest, HandlesEmptyArgv)
|
||||
{
|
||||
std::vector<char*> envp = {
|
||||
strdup("LD_LIBRARY_PATH=/usr/lib"),
|
||||
};
|
||||
std::vector<char*> argv;
|
||||
add_torch_library_path(envp, argv, false, updated_envs);
|
||||
ASSERT_EQ(envp.size(), 1);
|
||||
EXPECT_STREQ(envp[0], "LD_LIBRARY_PATH=/usr/lib");
|
||||
for(auto* entry : envp)
|
||||
std::free(entry);
|
||||
}
|
||||
|
||||
TEST_F(AddTorchLibraryPathTest, HandlesNullArgvFront)
|
||||
{
|
||||
std::vector<char*> envp = {
|
||||
strdup("LD_LIBRARY_PATH=/usr/lib"),
|
||||
};
|
||||
std::vector<char*> argv = { nullptr };
|
||||
add_torch_library_path(envp, argv, false, updated_envs);
|
||||
ASSERT_EQ(envp.size(), 1);
|
||||
for(auto* entry : envp)
|
||||
std::free(entry);
|
||||
}
|
||||
@@ -168,6 +168,14 @@ add_ld_library_path(parser_data& _data)
|
||||
return _data;
|
||||
}
|
||||
|
||||
parser_data&
|
||||
add_torch_library_path(parser_data& _data, bool verbose)
|
||||
{
|
||||
rocprofsys::common::add_torch_library_path(_data.current, _data.command, verbose,
|
||||
_data.updated);
|
||||
return _data;
|
||||
}
|
||||
|
||||
parser_data&
|
||||
add_core_arguments(parser_t& _parser, parser_data& _data)
|
||||
{
|
||||
|
||||
@@ -83,6 +83,9 @@ add_ld_preload(parser_data&);
|
||||
parser_data&
|
||||
add_ld_library_path(parser_data&);
|
||||
|
||||
parser_data&
|
||||
add_torch_library_path(parser_data&, bool verbose = false);
|
||||
|
||||
parser_data&
|
||||
add_core_arguments(parser_t&, parser_data&);
|
||||
|
||||
|
||||
@@ -316,7 +316,9 @@ configure_settings(bool _init)
|
||||
"backend", "perfetto");
|
||||
|
||||
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE_LEGACY",
|
||||
"Use legacy direct mode for perfetto tracing instead of "
|
||||
"[DEPRECATED] The new default option is to use data from "
|
||||
"cached buffer. When set to true system will use "
|
||||
"legacy direct mode for perfetto tracing instead of "
|
||||
"deferred trace generation. When false (default), uses "
|
||||
"cached mode with minimal runtime overhead.",
|
||||
false, "backend", "perfetto");
|
||||
@@ -1088,6 +1090,9 @@ configure_settings(bool _init)
|
||||
handle_deprecated_setting("ROCPROFSYS_OUTPUT_FILE", "ROCPROFSYS_PERFETTO_FILE");
|
||||
handle_deprecated_setting("ROCPROFSYS_USE_PERFETTO", "ROCPROFSYS_TRACE");
|
||||
handle_deprecated_setting("ROCPROFSYS_USE_TIMEMORY", "ROCPROFSYS_PROFILE");
|
||||
handle_deprecated_setting("ROCPROFSYS_DEBUG", "ROCPROFSYS_LOG_LEVEL");
|
||||
handle_deprecated_setting("ROCPROFSYS_VERBOSE", "ROCPROFSYS_LOG_LEVEL");
|
||||
handle_deprecated_setting("ROCPROFSYS_TRACE_LEGACY", "ROCPROFSYS_TRACE");
|
||||
|
||||
scope::get_fields()[scope::flat::value] = _config->get_flat_profile();
|
||||
scope::get_fields()[scope::timeline::value] = _config->get_timeline_profile();
|
||||
|
||||
@@ -31,6 +31,8 @@
|
||||
} \
|
||||
} // namespace ::tim::cereal
|
||||
|
||||
#include "common/defines.h"
|
||||
|
||||
#if !defined(ROCPROFSYS_USE_ROCM)
|
||||
# define ROCPROFSYS_USE_ROCM 0
|
||||
#endif
|
||||
@@ -40,7 +42,6 @@
|
||||
|
||||
#include <timemory/manager.hpp>
|
||||
|
||||
#include <dlfcn.h>
|
||||
#include <string>
|
||||
|
||||
#include "core/agent_manager.hpp"
|
||||
@@ -92,17 +93,6 @@ _amdsmi_is_initialized()
|
||||
return initialized;
|
||||
}
|
||||
|
||||
void
|
||||
prevent_amdsmi_library_unload()
|
||||
{
|
||||
static bool _initialized = false;
|
||||
if(_initialized) return;
|
||||
_initialized = true;
|
||||
|
||||
dlopen("libamd_smi.so", RTLD_NOW | RTLD_NOLOAD | RTLD_NODELETE);
|
||||
dlopen("librocm_smi64.so", RTLD_NOW | RTLD_NOLOAD | RTLD_NODELETE);
|
||||
}
|
||||
|
||||
bool
|
||||
amdsmi_init()
|
||||
{
|
||||
@@ -113,8 +103,6 @@ amdsmi_init()
|
||||
ROCPROFSYS_AMD_SMI_CALL(::amdsmi_init(AMDSMI_INIT_AMD_GPUS));
|
||||
get_processor_handles();
|
||||
_amdsmi_is_initialized() = true; // Mark as initialized
|
||||
|
||||
prevent_amdsmi_library_unload();
|
||||
} catch(std::exception& _e)
|
||||
{
|
||||
LOG_ERROR("Exception thrown initializing amd-smi: {}", _e.what());
|
||||
|
||||
@@ -367,7 +367,6 @@ config_settings(const std::shared_ptr<settings>& _config)
|
||||
|
||||
_skip_domains.emplace("kernel_dispatch");
|
||||
_skip_domains.emplace("page_migration");
|
||||
_skip_domains.emplace("scratch_memory");
|
||||
|
||||
_add_operation_settings(
|
||||
"MARKER_API", callback_tracing_info[ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API],
|
||||
@@ -652,7 +651,7 @@ get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv)
|
||||
{
|
||||
if(callback_operation_option_names.count(kindv) == 0)
|
||||
{
|
||||
LOG_CRITICAL("callback_operation_operation_names does not have value for {}",
|
||||
LOG_CRITICAL("callback_operation_option_names does not have value for {}",
|
||||
static_cast<int>(kindv));
|
||||
::rocprofsys::set_state(::rocprofsys::State::Finalized);
|
||||
std::abort();
|
||||
|
||||
@@ -41,7 +41,7 @@ using storage_parser_t =
|
||||
storage_parser<type_identifier_t, kernel_dispatch_sample, memory_copy_sample,
|
||||
memory_allocate_sample, region_sample, in_time_sample,
|
||||
pmc_event_with_sample, amd_smi_sample, cpu_freq_sample,
|
||||
backtrace_region_sample>;
|
||||
backtrace_region_sample, scratch_memory_sample>;
|
||||
|
||||
using buffer_storage_t = buffer_storage<flush_worker_factory_t, type_identifier_t>;
|
||||
|
||||
|
||||
@@ -540,6 +540,72 @@ perfetto_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
perfetto_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms)
|
||||
{
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
auto _corr_id = _sms.correlation_id_internal;
|
||||
auto _stream_id = _sms.stream_handle;
|
||||
auto _queue_id_handle = _sms.queue_id_handle;
|
||||
const auto& _t_info = thread_info::get(_sms.thread_id, SystemTID);
|
||||
const auto _thread_id_sequent = _t_info->index_data->sequent_value;
|
||||
auto _beg_ts = _sms.start_timestamp;
|
||||
auto _end_ts = _sms.end_timestamp;
|
||||
|
||||
auto _agent_device_id =
|
||||
m_agent_manager.get_agent_by_handle(_sms.agent_id_handle).device_type_index;
|
||||
auto _name = std::string{ m_metadata.get_buffer_name_info().at(
|
||||
static_cast<rocprofiler_buffer_tracing_kind_t>(_sms.kind),
|
||||
static_cast<rocprofiler_tracing_operation_t>(_sms.operation)) };
|
||||
|
||||
// Scratch memory samples from SDK versions prior to 7.0.2 do not include
|
||||
// allocation_size field, so counter tracks are not needed
|
||||
# if ROCPROFSYS_ROCM_VERSION >= 70002
|
||||
using counter_track =
|
||||
perfetto_counter_track<rocprofiler_buffer_tracing_scratch_memory_record_t>;
|
||||
|
||||
if(!counter_track::exists(_agent_device_id))
|
||||
{
|
||||
auto _track_desc_alloc_size = JOIN("", "GPU Scratch Memory [", _agent_device_id,
|
||||
"] Thread ", _thread_id_sequent);
|
||||
counter_track::emplace(_agent_device_id, _track_desc_alloc_size, "bytes");
|
||||
}
|
||||
|
||||
if(_sms.operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC)
|
||||
{
|
||||
TRACE_COUNTER("rocm_scratch_memory", counter_track::at(_agent_device_id, 0),
|
||||
_beg_ts, _sms.allocation_size);
|
||||
}
|
||||
# endif
|
||||
|
||||
auto _track_desc_events = [&]() {
|
||||
return JOIN("", "GPU Scratch Memory Events Thread ", _thread_id_sequent);
|
||||
};
|
||||
|
||||
const auto _track =
|
||||
tracing::get_perfetto_track(category::rocm_scratch_memory{}, _track_desc_events);
|
||||
|
||||
auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) {
|
||||
if(!m_use_annotations) return;
|
||||
|
||||
annotate_perfetto(ctx, { { "begin_ns", _beg_ts },
|
||||
{ "end_ns", _end_ts },
|
||||
{ "corr_id", _corr_id },
|
||||
{ "stream_id", _stream_id },
|
||||
{ "queue", _queue_id_handle },
|
||||
{ "allocation_size", _sms.allocation_size },
|
||||
{ "agent_id", _agent_device_id },
|
||||
{ "operation", _name },
|
||||
{ "flags", _sms.flags } });
|
||||
};
|
||||
|
||||
tracing::push_perfetto(category::rocm_scratch_memory{}, _name.c_str(), _track,
|
||||
_beg_ts, ::perfetto::Flow::ProcessScoped(_corr_id),
|
||||
add_perfetto_annotations);
|
||||
tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track, _end_ts);
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
perfetto_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs)
|
||||
{
|
||||
|
||||
@@ -56,6 +56,7 @@ public:
|
||||
void finalize_processing();
|
||||
|
||||
void handle(const kernel_dispatch_sample& sample);
|
||||
void handle(const scratch_memory_sample& sample);
|
||||
void handle(const memory_copy_sample& sample);
|
||||
void handle(const memory_allocate_sample& sample);
|
||||
void handle(const region_sample& sample);
|
||||
|
||||
@@ -66,6 +66,37 @@ get_handle_from_code_object(
|
||||
# endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
using memory_operation = std::string;
|
||||
using memory_type = std::string;
|
||||
std::pair<memory_operation, memory_type>
|
||||
parse_memory_operation_name(std::string_view memory_operation_name)
|
||||
{
|
||||
static const std::unordered_map<std::string_view,
|
||||
std::pair<memory_operation, memory_type>>
|
||||
parsing_map{
|
||||
{ "MEMORY_ALLOCATION_NONE", { "NONE", "REAL" } },
|
||||
{ "MEMORY_ALLOCATION_ALLOCATE", { "ALLOC", "REAL" } },
|
||||
{ "MEMORY_ALLOCATION_VMEM_ALLOCATE", { "ALLOC", "VIRTUAL" } },
|
||||
{ "MEMORY_ALLOCATION_FREE", { "FREE", "REAL" } },
|
||||
{ "MEMORY_ALLOCATION_VMEM_FREE", { "FREE", "VIRTUAL" } },
|
||||
{ "SCRATCH_MEMORY_NONE", { "NONE", "SCRATCH" } },
|
||||
{ "SCRATCH_MEMORY_ALLOC", { "ALLOC", "SCRATCH" } },
|
||||
{ "SCRATCH_MEMORY_FREE", { "FREE", "SCRATCH" } },
|
||||
{ "SCRATCH_MEMORY_ASYNC_RECLAIM", { "ASYNC_RECLAIM", "SCRATCH" } },
|
||||
};
|
||||
|
||||
auto item = parsing_map.find(memory_operation_name);
|
||||
if(item == parsing_map.end())
|
||||
{
|
||||
LOG_WARNING("Unknown memory operation name: {}", memory_operation_name);
|
||||
return { "UNKNOWN", "UNKNOWN" };
|
||||
}
|
||||
|
||||
return item->second;
|
||||
}
|
||||
#endif
|
||||
} // namespace
|
||||
|
||||
void
|
||||
@@ -110,6 +141,46 @@ rocpd_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds)
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
rocpd_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms)
|
||||
{
|
||||
#if ROCPROFSYS_USE_ROCM > 0
|
||||
auto& n_info = node_info::get_instance();
|
||||
auto process = m_metadata->get_process_info();
|
||||
|
||||
const auto* _name = m_metadata->get_buffer_name_info().at(
|
||||
static_cast<rocprofiler_buffer_tracing_kind_t>(_sms.kind),
|
||||
static_cast<rocprofiler_tracing_operation_t>(_sms.operation));
|
||||
|
||||
auto agent_primary_key =
|
||||
m_agent_manager->get_agent_by_handle(_sms.agent_id_handle).base_id;
|
||||
|
||||
auto thread_primary_key =
|
||||
m_data_processor->map_thread_id_to_primary_key(_sms.thread_id);
|
||||
|
||||
auto category_primary_key = m_data_processor->insert_string(
|
||||
trait::name<category::rocm_scratch_memory>::value);
|
||||
|
||||
auto stack_id = _sms.correlation_id_internal;
|
||||
auto parent_stack_id = _sms.correlation_id_ancestor;
|
||||
auto correlation_id = 0;
|
||||
auto address_value = 0;
|
||||
|
||||
auto event_primary_key = m_data_processor->insert_event(
|
||||
category_primary_key, stack_id, parent_stack_id, correlation_id);
|
||||
|
||||
auto [memory_operation, memory_type] = parse_memory_operation_name(_name);
|
||||
|
||||
auto extdata_json_str = JOIN("", "{\"flags\": ", _sms.flags, "}");
|
||||
|
||||
m_data_processor->insert_memory_alloc(
|
||||
n_info.id, process.pid, thread_primary_key, agent_primary_key,
|
||||
memory_operation.c_str(), memory_type.c_str(), _sms.start_timestamp,
|
||||
_sms.end_timestamp, address_value, _sms.allocation_size, _sms.queue_id_handle,
|
||||
_sms.stream_handle, event_primary_key, extdata_json_str.c_str());
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
rocpd_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs)
|
||||
{
|
||||
@@ -153,46 +224,6 @@ void
|
||||
rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
|
||||
{
|
||||
#if ROCPROFSYS_USE_ROCM > 0 && (ROCPROFILER_VERSION >= 600)
|
||||
static auto memtype_to_db =
|
||||
[](std::string_view memory_type) -> std::pair<std::string, std::string> {
|
||||
constexpr auto MEMORY_PREFIX = std::string_view{ "MEMORY_ALLOCATION_" };
|
||||
constexpr auto SCRATCH_PREFIX = std::string_view{ "SCRATCH_MEMORY_" };
|
||||
constexpr auto VMEM_PREFIX = std::string_view{ "VMEM_" };
|
||||
constexpr auto ASYNC_PREFIX = std::string_view{ "ASYNC_" };
|
||||
|
||||
std::string _type;
|
||||
std::string _level;
|
||||
if(memory_type.find(MEMORY_PREFIX) == 0)
|
||||
{
|
||||
_type = memory_type.substr(MEMORY_PREFIX.length());
|
||||
if(_type.find(VMEM_PREFIX) == 0)
|
||||
{
|
||||
_type = _type.substr(VMEM_PREFIX.length());
|
||||
_level = "VIRTUAL";
|
||||
}
|
||||
else
|
||||
{
|
||||
_level = "REAL";
|
||||
}
|
||||
}
|
||||
else if(memory_type.find(SCRATCH_PREFIX) == 0)
|
||||
{
|
||||
_type = memory_type.substr(SCRATCH_PREFIX.length());
|
||||
_level = "SCRATCH";
|
||||
if(memory_type.find(ASYNC_PREFIX) == 0)
|
||||
{
|
||||
_type = memory_type.substr(ASYNC_PREFIX.length()); // RECLAIM
|
||||
}
|
||||
}
|
||||
|
||||
if(_type == "ALLOCATE")
|
||||
{
|
||||
_type = "ALLOC";
|
||||
}
|
||||
|
||||
return std::make_pair(_type, _level);
|
||||
};
|
||||
|
||||
auto& n_info = node_info::get_instance();
|
||||
auto process = m_metadata->get_process_info();
|
||||
auto thread_primary_key =
|
||||
@@ -210,7 +241,7 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
|
||||
static_cast<rocprofiler_buffer_tracing_kind_t>(_mas.kind),
|
||||
static_cast<rocprofiler_tracing_operation_t>(_mas.operation));
|
||||
|
||||
auto [type, level] = memtype_to_db(_name);
|
||||
auto [memory_operation, memory_type] = parse_memory_operation_name(_name);
|
||||
|
||||
auto stack_id = _mas.correlation_id_internal;
|
||||
auto parent_stack_id = _mas.correlation_id_ancestor;
|
||||
@@ -224,9 +255,10 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
|
||||
category_primary_key, stack_id, parent_stack_id, correlation_id);
|
||||
|
||||
m_data_processor->insert_memory_alloc(
|
||||
n_info.id, process.pid, thread_primary_key, agent_primary_key, type.c_str(),
|
||||
level.c_str(), _mas.start_timestamp, _mas.end_timestamp, _mas.address_value,
|
||||
_mas.allocation_size, queue_id, _mas.stream_handle, event_primary_key);
|
||||
n_info.id, process.pid, thread_primary_key, agent_primary_key,
|
||||
memory_operation.c_str(), memory_type.c_str(), _mas.start_timestamp,
|
||||
_mas.end_timestamp, _mas.address_value, _mas.allocation_size, queue_id,
|
||||
_mas.stream_handle, event_primary_key);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -45,6 +45,7 @@ public:
|
||||
void finalize_processing();
|
||||
|
||||
void handle(const kernel_dispatch_sample& sample);
|
||||
void handle(const scratch_memory_sample& sample);
|
||||
void handle(const memory_copy_sample& sample);
|
||||
void handle(const memory_allocate_sample& sample);
|
||||
void handle(const region_sample& sample);
|
||||
|
||||
@@ -43,6 +43,11 @@ struct processor_t
|
||||
static_cast<T*>(this)->handle(sample);
|
||||
}
|
||||
|
||||
void handle(const scratch_memory_sample& sample)
|
||||
{
|
||||
static_cast<T*>(this)->handle(sample);
|
||||
}
|
||||
|
||||
void handle(const memory_copy_sample& sample)
|
||||
{
|
||||
static_cast<T*>(this)->handle(sample);
|
||||
@@ -84,6 +89,7 @@ protected:
|
||||
struct processor_view_t
|
||||
{
|
||||
using kernel_dispatch_fn_t = void (*)(void*, const kernel_dispatch_sample&) noexcept;
|
||||
using scratch_memory_fn_t = void (*)(void*, const scratch_memory_sample&) noexcept;
|
||||
using memory_copy_fn_t = void (*)(void*, const memory_copy_sample&) noexcept;
|
||||
#if(ROCPROFILER_VERSION >= 600)
|
||||
using memory_allocate_fn_t = void (*)(void*, const memory_allocate_sample&) noexcept;
|
||||
@@ -101,6 +107,7 @@ struct processor_view_t
|
||||
struct vtable_t
|
||||
{
|
||||
kernel_dispatch_fn_t handle_kernel_dispatch;
|
||||
scratch_memory_fn_t handle_scratch_memory;
|
||||
memory_copy_fn_t handle_memory_copy;
|
||||
#if(ROCPROFILER_VERSION >= 600)
|
||||
memory_allocate_fn_t handle_memory_allocate;
|
||||
@@ -134,6 +141,11 @@ struct processor_view_t
|
||||
m_vtable->handle_kernel_dispatch(m_object, sample);
|
||||
}
|
||||
|
||||
ROCPROFSYS_INLINE void handle(const scratch_memory_sample& sample) const noexcept
|
||||
{
|
||||
m_vtable->handle_scratch_memory(m_object, sample);
|
||||
}
|
||||
|
||||
ROCPROFSYS_INLINE void handle(const memory_copy_sample& sample) const noexcept
|
||||
{
|
||||
m_vtable->handle_memory_copy(m_object, sample);
|
||||
@@ -194,6 +206,9 @@ private:
|
||||
+[](void* obj, const kernel_dispatch_sample& sample) noexcept {
|
||||
static_cast<T*>(obj)->handle(sample);
|
||||
},
|
||||
+[](void* obj, const scratch_memory_sample& sample) noexcept {
|
||||
static_cast<T*>(obj)->handle(sample);
|
||||
},
|
||||
+[](void* obj, const memory_copy_sample& sample) noexcept {
|
||||
static_cast<T*>(obj)->handle(sample);
|
||||
},
|
||||
@@ -275,6 +290,9 @@ struct sample_processor_t
|
||||
case type_identifier_t::kernel_dispatch:
|
||||
handle_sample(static_cast<const kernel_dispatch_sample&>(sample));
|
||||
break;
|
||||
case type_identifier_t::scratch_memory:
|
||||
handle_sample(static_cast<const scratch_memory_sample&>(sample));
|
||||
break;
|
||||
case type_identifier_t::memory_copy:
|
||||
handle_sample(static_cast<const memory_copy_sample&>(sample));
|
||||
break;
|
||||
|
||||
@@ -46,6 +46,7 @@ enum class type_identifier_t : uint32_t
|
||||
amd_smi_sample = 0x0006,
|
||||
cpu_freq_sample = 0x0007,
|
||||
backtrace_region_sample = 0x0008,
|
||||
scratch_memory = 0x0009,
|
||||
fragmented_space = 0xFFFF
|
||||
};
|
||||
|
||||
@@ -148,6 +149,83 @@ get_size(const kernel_dispatch_sample& item)
|
||||
item.grid_size_z, static_cast<uint64_t>(item.stream_handle));
|
||||
}
|
||||
|
||||
struct scratch_memory_sample : cacheable_t
|
||||
{
|
||||
static constexpr type_identifier_t type_identifier =
|
||||
type_identifier_t::scratch_memory;
|
||||
|
||||
scratch_memory_sample() = default;
|
||||
scratch_memory_sample(uint64_t _start_timestamp, uint64_t _end_timestamp,
|
||||
uint64_t _thread_id, uint64_t _agent_id_handle,
|
||||
uint64_t _queue_id_handle, int32_t _kind, int32_t _operation,
|
||||
int32_t _flags, uint64_t _allocation_size,
|
||||
uint64_t _correlation_id_internal,
|
||||
uint64_t _correlation_id_ancestor, size_t _stream_handle)
|
||||
: start_timestamp(_start_timestamp)
|
||||
, end_timestamp(_end_timestamp)
|
||||
, thread_id(_thread_id)
|
||||
, agent_id_handle(_agent_id_handle)
|
||||
, queue_id_handle(_queue_id_handle)
|
||||
, kind(_kind)
|
||||
, operation(_operation)
|
||||
, flags(_flags)
|
||||
, allocation_size(_allocation_size)
|
||||
, correlation_id_internal(_correlation_id_internal)
|
||||
, correlation_id_ancestor(_correlation_id_ancestor)
|
||||
, stream_handle(_stream_handle)
|
||||
{}
|
||||
|
||||
uint64_t start_timestamp;
|
||||
uint64_t end_timestamp;
|
||||
uint64_t thread_id;
|
||||
uint64_t agent_id_handle;
|
||||
uint64_t queue_id_handle;
|
||||
int32_t kind;
|
||||
int32_t operation;
|
||||
int32_t flags;
|
||||
uint64_t allocation_size;
|
||||
uint64_t correlation_id_internal;
|
||||
uint64_t correlation_id_ancestor;
|
||||
size_t stream_handle;
|
||||
};
|
||||
|
||||
template <>
|
||||
inline void
|
||||
serialize(uint8_t* buffer, const scratch_memory_sample& item)
|
||||
{
|
||||
utility::store_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
|
||||
item.agent_id_handle, item.queue_id_handle, item.kind,
|
||||
item.operation, item.flags, item.allocation_size,
|
||||
item.correlation_id_internal, item.correlation_id_ancestor,
|
||||
static_cast<uint64_t>(item.stream_handle));
|
||||
}
|
||||
|
||||
template <>
|
||||
inline scratch_memory_sample
|
||||
deserialize(uint8_t*& buffer)
|
||||
{
|
||||
scratch_memory_sample item;
|
||||
uint64_t stream_handle;
|
||||
utility::parse_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
|
||||
item.agent_id_handle, item.queue_id_handle, item.kind,
|
||||
item.operation, item.flags, item.allocation_size,
|
||||
item.correlation_id_internal, item.correlation_id_ancestor,
|
||||
stream_handle);
|
||||
item.stream_handle = stream_handle;
|
||||
return item;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline size_t
|
||||
get_size(const scratch_memory_sample& item)
|
||||
{
|
||||
return utility::get_size(item.start_timestamp, item.end_timestamp, item.thread_id,
|
||||
item.agent_id_handle, item.queue_id_handle, item.kind,
|
||||
item.operation, item.flags, item.allocation_size,
|
||||
item.correlation_id_internal, item.correlation_id_ancestor,
|
||||
static_cast<uint64_t>(item.stream_handle));
|
||||
}
|
||||
|
||||
struct memory_copy_sample : cacheable_t
|
||||
{
|
||||
static constexpr type_identifier_t type_identifier = type_identifier_t::memory_copy;
|
||||
|
||||
@@ -1272,7 +1272,10 @@ shutdown()
|
||||
|
||||
try
|
||||
{
|
||||
data::shutdown();
|
||||
if(data::shutdown())
|
||||
{
|
||||
ROCPROFSYS_AMD_SMI_CALL(amdsmi_shut_down());
|
||||
}
|
||||
} catch(std::runtime_error& _e)
|
||||
{
|
||||
LOG_WARNING("Exception thrown when shutting down amd-smi: {}", _e.what());
|
||||
|
||||
+1
-22
@@ -24,6 +24,7 @@
|
||||
#include "core/common.hpp"
|
||||
#include "core/config.hpp"
|
||||
#include "core/state.hpp"
|
||||
#include "core/timemory.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
|
||||
#include <timemory/backends/threading.hpp>
|
||||
@@ -33,9 +34,7 @@
|
||||
#include "logger/debug.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <unistd.h>
|
||||
|
||||
namespace rocprofsys
|
||||
{
|
||||
@@ -89,26 +88,6 @@ void
|
||||
exit_gotcha::operator()(const gotcha_data& _data, exit_func_t _func, int _ec) const
|
||||
{
|
||||
_exit_info = { true, _data.tool_id.find("quick") != std::string::npos, _ec };
|
||||
|
||||
if(config::get_use_amd_smi())
|
||||
{
|
||||
threading::clear_callbacks();
|
||||
|
||||
if(get_state() < ::rocprofsys::State::Finalized && !is_child_process())
|
||||
{
|
||||
LOG_DEBUG("Finalizing {} before calling {}({})...", get_exe_name(),
|
||||
_data.tool_id, _ec);
|
||||
|
||||
rocprofsys_finalize();
|
||||
}
|
||||
|
||||
LOG_DEBUG("Calling _exit({}) in {} to avoid AMD SMI cleanup issues...", _ec,
|
||||
get_exe_name().c_str());
|
||||
|
||||
std::fflush(nullptr);
|
||||
_exit(_ec);
|
||||
}
|
||||
|
||||
invoke_exit_gotcha(_data, _func, _ec);
|
||||
}
|
||||
|
||||
|
||||
@@ -567,6 +567,18 @@ get_mem_alloc_address(
|
||||
}
|
||||
#endif
|
||||
|
||||
uint64_t
|
||||
get_scratch_mem_alloc_size(
|
||||
[[maybe_unused]] const rocprofiler_buffer_tracing_scratch_memory_record_t& record)
|
||||
{
|
||||
// Scratch memory samples from SDK versions prior to 7.0.2 do not include allocation_size
|
||||
#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002)
|
||||
return record.allocation_size;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
void
|
||||
cache_region(const rocprofiler_callback_tracing_record_t* record,
|
||||
const rocprofiler_timestamp_t start_timestamp,
|
||||
@@ -615,13 +627,26 @@ cache_kernel_dispatch(rocprofiler_buffer_tracing_kernel_dispatch_record_t* recor
|
||||
record->dispatch_info.grid_size.z, stream_handle });
|
||||
}
|
||||
|
||||
void
|
||||
cache_scratch_memory(rocprofiler_buffer_tracing_scratch_memory_record_t* record,
|
||||
uint64_t stream_handle)
|
||||
{
|
||||
trace_cache::get_metadata_registry().add_stream(stream_handle);
|
||||
trace_cache::get_buffer_storage().store(trace_cache::scratch_memory_sample{
|
||||
record->start_timestamp, record->end_timestamp, record->thread_id,
|
||||
record->agent_id.handle, record->queue_id.handle,
|
||||
static_cast<int32_t>(record->kind), static_cast<int32_t>(record->operation),
|
||||
static_cast<int32_t>(record->flags), get_scratch_mem_alloc_size(*record),
|
||||
record->correlation_id.internal, get_parent_stack_id(record->correlation_id),
|
||||
stream_handle });
|
||||
}
|
||||
|
||||
void
|
||||
cache_memory_copy(rocprofiler_buffer_tracing_memory_copy_record_t* record,
|
||||
uint64_t stream_handle)
|
||||
{
|
||||
trace_cache::get_metadata_registry().add_stream(stream_handle);
|
||||
trace_cache::get_buffer_storage().store(trace_cache::memory_copy_sample{
|
||||
|
||||
record->start_timestamp, record->end_timestamp, record->thread_id,
|
||||
record->dst_agent_id.handle, record->src_agent_id.handle,
|
||||
static_cast<int32_t>(record->kind), static_cast<int32_t>(record->operation),
|
||||
@@ -1759,6 +1784,120 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/,
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)
|
||||
{
|
||||
auto* record =
|
||||
static_cast<rocprofiler_buffer_tracing_scratch_memory_record_t*>(
|
||||
header->payload);
|
||||
|
||||
bool _group_by_queue = _default_group_by_queue;
|
||||
|
||||
const auto* agent = tool_data->get_gpu_tool_agent(record->agent_id);
|
||||
auto device_id = static_cast<uint32_t>(agent->device_id);
|
||||
|
||||
const auto& t_info = thread_info::get(record->thread_id, SystemTID);
|
||||
auto thread_id_sequent = t_info->index_data->sequent_value;
|
||||
|
||||
auto _corr_id = record->correlation_id.internal;
|
||||
auto _beg_ns = record->start_timestamp;
|
||||
auto _end_ns = record->end_timestamp;
|
||||
auto _name =
|
||||
tool_data->buffered_tracing_info.at(record->kind, record->operation);
|
||||
|
||||
auto _stream_id = get_stream_id(record).handle;
|
||||
if(_stream_id == 0)
|
||||
{
|
||||
// Scratch memory event is not associated with a HIP stream
|
||||
_group_by_queue = true;
|
||||
}
|
||||
|
||||
{
|
||||
auto track_name = JOIN("", "GPU Scratch Memory [", device_id,
|
||||
"] Thread ", record->thread_id);
|
||||
cache_category<category::rocm_scratch_memory>();
|
||||
cache_add_thread_info(record->thread_id);
|
||||
cache_add_track(track_name.c_str(), record->thread_id);
|
||||
cache_scratch_memory(record, _stream_id);
|
||||
}
|
||||
|
||||
if(get_use_timemory())
|
||||
{
|
||||
auto _bundle = kernel_dispatch_bundle_t{ _name };
|
||||
|
||||
_bundle.push(thread_id_sequent).start().stop();
|
||||
_bundle.get([_beg_ns, _end_ns](tim::component::wall_clock* _wc) {
|
||||
_wc->set_value(_end_ns - _beg_ns);
|
||||
_wc->set_accum(_end_ns - _beg_ns);
|
||||
});
|
||||
_bundle.pop();
|
||||
}
|
||||
|
||||
if(get_use_perfetto())
|
||||
{
|
||||
// Scratch memory samples from SDK versions prior to 7.0.2 do not include
|
||||
// allocation_size field, so counter tracks are not needed
|
||||
#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002)
|
||||
using counter_track = perfetto_counter_track<
|
||||
rocprofiler_buffer_tracing_scratch_memory_record_t>;
|
||||
|
||||
if(!counter_track::exists(device_id))
|
||||
{
|
||||
auto track_name_alloc_size =
|
||||
JOIN("", "GPU Scratch Memory [", device_id, "] (S) Thread ",
|
||||
thread_id_sequent);
|
||||
counter_track::emplace(device_id, track_name_alloc_size, "bytes");
|
||||
}
|
||||
|
||||
if(record->operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC)
|
||||
{
|
||||
TRACE_COUNTER("rocm_scratch_memory",
|
||||
counter_track::at(device_id, 0), _beg_ns,
|
||||
record->allocation_size);
|
||||
}
|
||||
#endif
|
||||
auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) {
|
||||
if(config::get_perfetto_annotations())
|
||||
{
|
||||
tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns);
|
||||
tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns);
|
||||
tracing::add_perfetto_annotation(ctx, "corr_id", _corr_id);
|
||||
tracing::add_perfetto_annotation(ctx, "stream_id",
|
||||
_stream_id);
|
||||
}
|
||||
};
|
||||
|
||||
if(_group_by_queue)
|
||||
{
|
||||
auto track_name_events = [&]() {
|
||||
return JOIN("", "GPU Scratch Memory (S) Events Thread ",
|
||||
thread_id_sequent);
|
||||
};
|
||||
const auto _track = tracing::get_perfetto_track(
|
||||
category::rocm_scratch_memory{}, track_name_events);
|
||||
|
||||
tracing::push_perfetto(category::rocm_scratch_memory{},
|
||||
_name.data(), _track, _beg_ns,
|
||||
::perfetto::Flow::ProcessScoped(_corr_id),
|
||||
add_perfetto_annotations);
|
||||
|
||||
tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track,
|
||||
_end_ns);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto _track = tracing::get_perfetto_track(
|
||||
category::rocm_hip_stream{}, _track_desc_stream, _stream_id);
|
||||
|
||||
tracing::push_perfetto(category::rocm_hip_stream{}, _name.data(),
|
||||
_track, _beg_ns,
|
||||
::perfetto::Flow::ProcessScoped(_corr_id),
|
||||
add_perfetto_annotations);
|
||||
|
||||
tracing::pop_perfetto(category::rocm_hip_stream{}, "", _track,
|
||||
_end_ns);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)
|
||||
{
|
||||
auto* record =
|
||||
@@ -2249,6 +2388,17 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data)
|
||||
_data->primary_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0,
|
||||
_data->memory_copy_buffer));
|
||||
}
|
||||
if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) > 0)
|
||||
{
|
||||
ROCPROFILER_CALL(rocprofiler_create_buffer(
|
||||
_data->primary_ctx, buffer_size, watermark,
|
||||
ROCPROFILER_BUFFER_POLICY_LOSSLESS, tool_tracing_buffered, tool_data,
|
||||
&_data->scratch_memory_buffer));
|
||||
|
||||
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
|
||||
_data->primary_ctx, ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, nullptr, 0,
|
||||
_data->scratch_memory_buffer));
|
||||
}
|
||||
|
||||
#if(ROCPROFILER_VERSION >= 600)
|
||||
if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION) > 0)
|
||||
|
||||
+5
-7
@@ -122,7 +122,7 @@ using backtrace_operation_map_t =
|
||||
|
||||
struct client_data
|
||||
{
|
||||
static constexpr size_t num_buffers = 4;
|
||||
static constexpr size_t num_buffers = 5;
|
||||
static constexpr size_t num_contexts = 2;
|
||||
|
||||
using buffer_name_info_t = rocprofiler::sdk::buffer_name_info_t<std::string_view>;
|
||||
@@ -138,6 +138,7 @@ struct client_data
|
||||
rocprofiler_context_id_t primary_ctx = { 0 };
|
||||
rocprofiler_context_id_t counter_ctx = { 0 };
|
||||
rocprofiler_buffer_id_t kernel_dispatch_buffer = { 0 };
|
||||
rocprofiler_buffer_id_t scratch_memory_buffer = { 0 };
|
||||
rocprofiler_buffer_id_t memory_copy_buffer = { 0 };
|
||||
rocprofiler_buffer_id_t memory_alloc_buffer = { 0 };
|
||||
rocprofiler_buffer_id_t counter_collection_buffer = { 0 };
|
||||
@@ -179,12 +180,9 @@ client_data::get_contexts() const
|
||||
inline client_data::buffer_id_vec_t
|
||||
client_data::get_buffers() const
|
||||
{
|
||||
return buffer_id_vec_t{
|
||||
kernel_dispatch_buffer,
|
||||
memory_copy_buffer,
|
||||
memory_alloc_buffer,
|
||||
counter_collection_buffer,
|
||||
};
|
||||
return buffer_id_vec_t{ kernel_dispatch_buffer, scratch_memory_buffer,
|
||||
memory_copy_buffer, memory_alloc_buffer,
|
||||
counter_collection_buffer };
|
||||
}
|
||||
|
||||
inline const rocprofsys_agent_t*
|
||||
|
||||
@@ -47,7 +47,6 @@ from .parser import parse_uploaded_file
|
||||
from .parser import find_causal_files
|
||||
import plotly.graph_objects as go
|
||||
|
||||
|
||||
file_timestamp = 0
|
||||
global_data = pd.DataFrame()
|
||||
global_samples = pd.DataFrame()
|
||||
|
||||
@@ -37,7 +37,6 @@ from . import libpyrocprofsys
|
||||
from .libpyrocprofsys.profiler import profiler_init as _profiler_init
|
||||
from .libpyrocprofsys.profiler import profiler_finalize as _profiler_fini
|
||||
|
||||
|
||||
__all__ = ["exec_", "_file", "_get_argv", "_initialize", "_finalize"]
|
||||
|
||||
|
||||
|
||||
@@ -44,7 +44,6 @@ from .libpyrocprofsys.user import pop_region
|
||||
from .common import _initialize
|
||||
from .common import _file
|
||||
|
||||
|
||||
__all__ = [
|
||||
"region",
|
||||
"Region",
|
||||
|
||||
Alguns arquivos não foram exibidos porque demasiados arquivos foram alterados neste diff Mostrar Mais
Referência em uma Nova Issue
Bloquear um usuário