Merge branch 'develop' into amd/dev/gandryey/SWDEV-558849

This commit is contained in:
German Andryeyev
2026-01-20 12:04:53 -05:00
gecommit door GitHub
bovenliggende 07a6b45535 4a5cbbfba5
commit 3af2bf4952
103 gewijzigde bestanden met toevoegingen van 2367 en 528 verwijderingen
@@ -41,7 +41,7 @@ jobs:
working-directory: projects/rocprofiler-compute
run: |
python -m pip install --upgrade pip
python -m pip install ruff
python -m pip install ruff==0.14.11
if [ -f requirements.txt ]; then python -m pip install -r requirements.txt; fi
- name: Run Ruff Linter and Import Sorter
run: |
@@ -226,6 +226,12 @@ class AMDSMICommands():
# Set args.* to passed in arguments
if gpu:
args.gpu = gpu
cpu_attributes = ["cpu"]
for attr in cpu_attributes:
if hasattr(args, 'cpu') and getattr(args, 'cpu'):
print("N/A")
return
# Handle No GPU passed
if args.gpu == None:
+1
Bestand weergeven
@@ -8,6 +8,7 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
* New HIP APIs
- `hipKernelGetParamInfo` returns the offset and size of a kernel parameter
* Support for `barrier_arrive` and `barrier_wait` for `grid_group` and `thread_block`.
* New HIP supports
- `grid_group::block_rank()` returns the rank of the block in the calling thread
@@ -111,9 +111,8 @@
#if !defined(__HIPCC_RTC__)
#include <hip/amd_detail/amd_hip_common.h>
#include "amd_hip_vector_types.h" // float2 etc
#include "device_library_decls.h" // ocml conversion functions
#include "math_fwd.h" // ocml device functions
#if defined(__clang__) && defined(__HIP__)
#include "math_fwd.h" // ocml device functions
#include <hip/amd_detail/amd_warp_functions.h> // define warpSize
#include <hip/amd_detail/amd_warp_sync_functions.h> // Sync functions
#endif
@@ -338,7 +337,11 @@ struct __attribute__((aligned(2))) __hip_bfloat16 {
};
/**@}*/
#if defined(__clang__)
typedef __bf16 __bf16_2 __attribute__((ext_vector_type(2)));
#else
typedef __bf16 __bf16_2 __attribute__((vector_size(sizeof(__bf16) * 2)));
#endif
/**
* \defgroup HIP_INTRINSIC_BFLOAT162_STRUCT
@@ -350,6 +353,7 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
static_assert(sizeof(__hip_bfloat16[2]) == sizeof(__bf16_2));
public:
#if defined(__clang__)
union {
struct {
__hip_bfloat16 x; /*! \brief raw representation of bfloat16 */
@@ -357,7 +361,12 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
};
__bf16_2 __xy_bf162;
};
#else
/* GCC does not support anonymous structs with members that have non-trivial constructors (Clang
allows this as an extension). Expose x and y directly instead. */
__hip_bfloat16 x;
__hip_bfloat16 y;
#endif
public:
/*! \brief create __hip_bfloat162 from __hip_bfloat162_raw */
@@ -373,7 +382,11 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
: x(a), y(b) {}
/*! \brief create __hip_bfloat162 from vector of __bf16_2 */
#if defined(__clang__)
__BF16_HOST_DEVICE__ __hip_bfloat162(const __bf16_2 in) : __xy_bf162(in) {}
#else
__BF16_HOST_DEVICE__ __hip_bfloat162(const __bf16_2 in) : x{in[0]}, y{in[1]} {}
#endif
/*! \brief default constructor of __hip_bfloat162 */
__BF16_HOST_DEVICE__ __hip_bfloat162() = default;
@@ -392,11 +405,22 @@ struct __attribute__((aligned(4))) __hip_bfloat162 {
}
/*! \brief return a vector of bf16 */
__BF16_HOST_DEVICE__ operator __bf16_2() const { return __xy_bf162; }
__BF16_HOST_DEVICE__ operator __bf16_2() const {
#if defined(__clang__)
return __xy_bf162;
#else
return __bf16_2{x, y};
#endif
}
/*! \brief return a vector of bf16 */
__BF16_HOST_DEVICE__ __hip_bfloat162& operator=(const __bf16_2 in) {
#if defined(__clang__)
__xy_bf162 = in;
#else
x = __hip_bfloat16{in[0]};
y = __hip_bfloat16{in[1]};
#endif
return *this;
}
@@ -835,6 +859,7 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat16 __hdiv(const __hip_bfloat16 a, const
return (__bf16)a / (__bf16)b;
}
#if defined(__clang__) && defined(__HIP__)
/**
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
* \brief Performs FMA of given bfloat16 values
@@ -844,6 +869,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat16 __hfma(const __hip_bfloat16 a, const __hip
return __hip_bfloat16(__builtin_elementwise_fma(__bf16(a), __bf16(b), __bf16(c)));
;
}
#endif
/**
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH
@@ -919,6 +945,8 @@ __BF16_HOST_DEVICE_STATIC__ __hip_bfloat162 __hadd2_rn(const __hip_bfloat162 a,
return __hip_bfloat162{__bf16_2(a) + __bf16_2(b)};
}
#if defined(__clang__) && defined(__HIP__)
/**
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
* \brief Performs FMA of given bfloat162 values
@@ -927,6 +955,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __hfma2(const __hip_bfloat162 a, const __
const __hip_bfloat162 c) {
return __hip_bfloat162{__builtin_elementwise_fma(__bf16_2(a), __bf16_2(b), __bf16_2(c))};
}
#endif
/**
* \ingroup HIP_INTRINSIC_BFLOAT162_ARITH
@@ -1639,6 +1668,7 @@ __BF16_HOST_DEVICE_STATIC__ bool operator>=(const __hip_bfloat162& l, const __hi
return fl.x >= fr.x && fl.x >= fr.y;
}
#if defined(__clang__) && defined(__HIP__)
/**
* \ingroup HIP_INTRINSIC_BFLOAT16_MATH
* \brief Calculate ceil of bfloat16
@@ -1883,7 +1913,6 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 h2trunc(const __hip_bfloat162 h) {
return __hip_bfloat162(htrunc(h.x), htrunc(h.y));
}
#if defined(__clang__) && defined(__HIP__)
/**
* \ingroup HIP_INTRINSIC_BFLOAT162_MATH
* \brief Atomic add bfloat162
@@ -212,6 +212,19 @@ class grid_group : public thread_group {
//! @copydoc thread_group::sync
__CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
__CG_QUALIFIER__ dim3 group_dim() const { return internal::grid::grid_dim(); }
struct arrival_token {
unsigned int signal;
};
//! Arrive at a barrier
__CG_QUALIFIER__ arrival_token barrier_arrive() const {
arrival_token t;
t.signal = internal::grid::barrier_signal();
return t;
}
//! Arrive at a barrier
__CG_QUALIFIER__ void barrier_wait(arrival_token&& t) const {
internal::grid::barrier_wait(t.signal);
}
};
/** \ingroup CooperativeGConstruct
@@ -295,6 +308,14 @@ class thread_block : public thread_group {
__CG_STATIC_QUALIFIER__ void sync() { internal::workgroup::sync(); }
//! Returns the group dimensions.
__CG_QUALIFIER__ dim3 group_dim() { return internal::workgroup::block_dim(); }
struct arrival_token {};
//! Arrive at a barrier
__CG_QUALIFIER__ arrival_token barrier_arrive() const {
internal::workgroup::barrier_arrive();
return arrival_token{};
}
//! Arrive at a barrier
__CG_QUALIFIER__ void barrier_wait(arrival_token&&) const { internal::workgroup::barrier_wait(); }
};
/** \ingroup CooperativeGConstruct
@@ -97,6 +97,8 @@ extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_size(void);
extern "C" __device__ __attribute__((const)) uint __ockl_multi_grid_thread_rank(void);
extern "C" __device__ __attribute__((const)) int __ockl_multi_grid_is_valid(void);
extern "C" __device__ __attribute__((convergent)) void __ockl_multi_grid_sync(void);
extern "C" __device__ __attribute__((const)) uint __ockl_grid_bar_arrive(void);
extern "C" __device__ __attribute__((convergent)) void __ockl_grid_bar_wait(uint);
extern "C" __device__ void __ockl_atomic_add_noret_f32(float*, float);
@@ -63,7 +63,7 @@
#define HIP_API_TABLE_STEP_VERSION 0
#define HIP_COMPILER_API_TABLE_STEP_VERSION 0
#define HIP_TOOLS_API_TABLE_STEP_VERSION 0
#define HIP_RUNTIME_API_TABLE_STEP_VERSION 20
#define HIP_RUNTIME_API_TABLE_STEP_VERSION 21
// HIP API interface
// HIP compiler dispatch functions
@@ -1113,6 +1113,9 @@ typedef hipError_t (*t_hipKernelGetLibrary)(hipLibrary_t* library, hipKernel_t k
typedef hipError_t (*t_hipKernelGetName)(const char** name, hipKernel_t kernel);
typedef hipError_t (*t_hipGetProcAddress_spt)(const char* symbol, void** pfn, int hipVersion, uint64_t flags,
hipDriverProcAddressQueryResult* symbolStatus);
typedef hipError_t (*t_hipExtDisableLogging)();
typedef hipError_t (*t_hipExtEnableLogging)();
typedef hipError_t (*t_hipExtSetLoggingParams)(size_t log_level, size_t log_size, size_t log_mask);
typedef hipError_t (*t_hipKernelGetParamInfo)(hipKernel_t kernel, size_t paramIndex,
size_t* paramOffset, size_t* paramSize);
@@ -1707,8 +1710,13 @@ struct HipDispatchTable {
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
t_hipKernelGetParamInfo hipKernelGetParamInfo_fn;
// DO NOT EDIT ABOVE!
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
t_hipExtDisableLogging hipExtDisableLogging_fn;
t_hipExtEnableLogging hipExtEnableLogging_fn;
t_hipExtSetLoggingParams hipExtSetLoggingParams_fn;
// DO NOT EDIT ABOVE!
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 22
// ******************************************************************************************* //
//
@@ -196,6 +196,9 @@ __CG_STATIC_QUALIFIER__ dim3 grid_dim() {
static_cast<__hip_uint32_t>(gridDim.z)));
}
__CG_STATIC_QUALIFIER__ unsigned int barrier_signal() { return __ockl_grid_bar_arrive(); }
__CG_STATIC_QUALIFIER__ void barrier_wait(unsigned int s) { __ockl_grid_bar_wait(s); }
} // namespace grid
/**
@@ -238,6 +241,23 @@ __CG_STATIC_QUALIFIER__ dim3 block_dim() {
static_cast<__hip_uint32_t>(blockDim.z)));
}
__CG_STATIC_QUALIFIER__ void barrier_arrive() {
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \
__has_builtin(__builtin_amdgcn_s_barrier_wait)
__builtin_amdgcn_s_barrier_signal(-1);
#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
}
__CG_STATIC_QUALIFIER__ void barrier_wait() {
#if __has_builtin(__builtin_amdgcn_s_barrier_signal) && \
__has_builtin(__builtin_amdgcn_s_barrier_wait)
__builtin_amdgcn_s_barrier_wait(-1);
#else
__builtin_amdgcn_s_barrier();
#endif // __builtin_amdgcn_s_barrier_signal && __builtin_amdgcn_s_barrier_wait
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
}
} // namespace workgroup
namespace tiled_group {
@@ -468,7 +468,10 @@ enum hip_api_id_t {
HIP_API_ID_hipKernelGetName = 448,
HIP_API_ID_hipOccupancyAvailableDynamicSMemPerBlock = 449,
HIP_API_ID_hipKernelGetParamInfo = 450,
HIP_API_ID_LAST = 450,
HIP_API_ID_hipExtDisableLogging = 451,
HIP_API_ID_hipExtEnableLogging = 452,
HIP_API_ID_hipExtSetLoggingParams = 453,
HIP_API_ID_LAST = 453,
HIP_API_ID_hipChooseDevice = HIP_API_ID_CONCAT(HIP_API_ID_,hipChooseDevice),
HIP_API_ID_hipGetDeviceProperties = HIP_API_ID_CONCAT(HIP_API_ID_,hipGetDeviceProperties),
@@ -590,12 +593,15 @@ static inline const char* hip_api_name(const uint32_t id) {
case HIP_API_ID_hipEventRecord: return "hipEventRecord";
case HIP_API_ID_hipEventRecordWithFlags: return "hipEventRecordWithFlags";
case HIP_API_ID_hipEventSynchronize: return "hipEventSynchronize";
case HIP_API_ID_hipExtDisableLogging: return "hipExtDisableLogging";
case HIP_API_ID_hipExtEnableLogging: return "hipExtEnableLogging";
case HIP_API_ID_hipExtGetLastError: return "hipExtGetLastError";
case HIP_API_ID_hipExtGetLinkTypeAndHopCount: return "hipExtGetLinkTypeAndHopCount";
case HIP_API_ID_hipExtLaunchKernel: return "hipExtLaunchKernel";
case HIP_API_ID_hipExtLaunchMultiKernelMultiDevice: return "hipExtLaunchMultiKernelMultiDevice";
case HIP_API_ID_hipExtMallocWithFlags: return "hipExtMallocWithFlags";
case HIP_API_ID_hipExtModuleLaunchKernel: return "hipExtModuleLaunchKernel";
case HIP_API_ID_hipExtSetLoggingParams: return "hipExtSetLoggingParams";
case HIP_API_ID_hipExtStreamCreateWithCUMask: return "hipExtStreamCreateWithCUMask";
case HIP_API_ID_hipExtStreamGetCUMask: return "hipExtStreamGetCUMask";
case HIP_API_ID_hipExternalMemoryGetMappedBuffer: return "hipExternalMemoryGetMappedBuffer";
@@ -1034,12 +1040,15 @@ static inline uint32_t hipApiIdByName(const char* name) {
if (strcmp("hipEventRecord", name) == 0) return HIP_API_ID_hipEventRecord;
if (strcmp("hipEventRecordWithFlags", name) == 0) return HIP_API_ID_hipEventRecordWithFlags;
if (strcmp("hipEventSynchronize", name) == 0) return HIP_API_ID_hipEventSynchronize;
if (strcmp("hipExtDisableLogging", name) == 0) return HIP_API_ID_hipExtDisableLogging;
if (strcmp("hipExtEnableLogging", name) == 0) return HIP_API_ID_hipExtEnableLogging;
if (strcmp("hipExtGetLastError", name) == 0) return HIP_API_ID_hipExtGetLastError;
if (strcmp("hipExtGetLinkTypeAndHopCount", name) == 0) return HIP_API_ID_hipExtGetLinkTypeAndHopCount;
if (strcmp("hipExtLaunchKernel", name) == 0) return HIP_API_ID_hipExtLaunchKernel;
if (strcmp("hipExtLaunchMultiKernelMultiDevice", name) == 0) return HIP_API_ID_hipExtLaunchMultiKernelMultiDevice;
if (strcmp("hipExtMallocWithFlags", name) == 0) return HIP_API_ID_hipExtMallocWithFlags;
if (strcmp("hipExtModuleLaunchKernel", name) == 0) return HIP_API_ID_hipExtModuleLaunchKernel;
if (strcmp("hipExtSetLoggingParams", name) == 0) return HIP_API_ID_hipExtSetLoggingParams;
if (strcmp("hipExtStreamCreateWithCUMask", name) == 0) return HIP_API_ID_hipExtStreamCreateWithCUMask;
if (strcmp("hipExtStreamGetCUMask", name) == 0) return HIP_API_ID_hipExtStreamGetCUMask;
if (strcmp("hipExternalMemoryGetMappedBuffer", name) == 0) return HIP_API_ID_hipExternalMemoryGetMappedBuffer;
@@ -1851,6 +1860,11 @@ typedef struct hip_api_data_s {
hipEvent_t stopEvent;
unsigned int flags;
} hipExtModuleLaunchKernel;
struct {
size_t log_level;
size_t log_size;
size_t log_mask;
} hipExtSetLoggingParams;
struct {
hipStream_t* stream;
hipStream_t stream__val;
@@ -4484,6 +4498,12 @@ typedef struct hip_api_data_s {
#define INIT_hipEventSynchronize_CB_ARGS_DATA(cb_data) { \
cb_data.args.hipEventSynchronize.event = (hipEvent_t)event; \
};
// hipExtDisableLogging[]
#define INIT_hipExtDisableLogging_CB_ARGS_DATA(cb_data) { \
};
// hipExtEnableLogging[]
#define INIT_hipExtEnableLogging_CB_ARGS_DATA(cb_data) { \
};
// hipExtGetLastError[]
#define INIT_hipExtGetLastError_CB_ARGS_DATA(cb_data) { \
};
@@ -4535,6 +4555,12 @@ typedef struct hip_api_data_s {
cb_data.args.hipExtModuleLaunchKernel.stopEvent = (hipEvent_t)stopEvent; \
cb_data.args.hipExtModuleLaunchKernel.flags = (unsigned int)flags; \
};
// hipExtSetLoggingParams[('size_t', 'log_level'), ('size_t', 'log_size'), ('size_t', 'log_mask')]
#define INIT_hipExtSetLoggingParams_CB_ARGS_DATA(cb_data) { \
cb_data.args.hipExtSetLoggingParams.log_level = (size_t)log_level; \
cb_data.args.hipExtSetLoggingParams.log_size = (size_t)log_size; \
cb_data.args.hipExtSetLoggingParams.log_mask = (size_t)log_mask; \
};
// hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')]
#define INIT_hipExtStreamCreateWithCUMask_CB_ARGS_DATA(cb_data) { \
cb_data.args.hipExtStreamCreateWithCUMask.stream = (hipStream_t*)stream; \
@@ -7125,6 +7151,12 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
// hipEventSynchronize[('hipEvent_t', 'event')]
case HIP_API_ID_hipEventSynchronize:
break;
// hipExtDisableLogging[]
case HIP_API_ID_hipExtDisableLogging:
break;
// hipExtEnableLogging[]
case HIP_API_ID_hipExtEnableLogging:
break;
// hipExtGetLastError[]
case HIP_API_ID_hipExtGetLastError:
break;
@@ -7150,6 +7182,9 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
if (data->args.hipExtModuleLaunchKernel.kernelParams) data->args.hipExtModuleLaunchKernel.kernelParams__val = *(data->args.hipExtModuleLaunchKernel.kernelParams);
if (data->args.hipExtModuleLaunchKernel.extra) data->args.hipExtModuleLaunchKernel.extra__val = *(data->args.hipExtModuleLaunchKernel.extra);
break;
// hipExtSetLoggingParams[('size_t', 'log_level'), ('size_t', 'log_size'), ('size_t', 'log_mask')]
case HIP_API_ID_hipExtSetLoggingParams:
break;
// hipExtStreamCreateWithCUMask[('hipStream_t*', 'stream'), ('unsigned int', 'cuMaskSize'), ('const unsigned int*', 'cuMask')]
case HIP_API_ID_hipExtStreamCreateWithCUMask:
if (data->args.hipExtStreamCreateWithCUMask.stream) data->args.hipExtStreamCreateWithCUMask.stream__val = *(data->args.hipExtStreamCreateWithCUMask.stream);
@@ -9124,6 +9159,14 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da
oss << "event="; roctracer::hip_support::detail::operator<<(oss, data->args.hipEventSynchronize.event);
oss << ")";
break;
case HIP_API_ID_hipExtDisableLogging:
oss << "hipExtDisableLogging(";
oss << ")";
break;
case HIP_API_ID_hipExtEnableLogging:
oss << "hipExtEnableLogging(";
oss << ")";
break;
case HIP_API_ID_hipExtGetLastError:
oss << "hipExtGetLastError(";
oss << ")";
@@ -9188,6 +9231,13 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* da
oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtModuleLaunchKernel.flags);
oss << ")";
break;
case HIP_API_ID_hipExtSetLoggingParams:
oss << "hipExtSetLoggingParams(";
oss << "log_level="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_level);
oss << ", log_size="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_size);
oss << ", log_mask="; roctracer::hip_support::detail::operator<<(oss, data->args.hipExtSetLoggingParams.log_mask);
oss << ")";
break;
case HIP_API_ID_hipExtStreamCreateWithCUMask:
oss << "hipExtStreamCreateWithCUMask(";
if (data->args.hipExtStreamCreateWithCUMask.stream == NULL) oss << "stream=NULL";
@@ -110,6 +110,7 @@ target_sources(amdhip64 PRIVATE
hip_graph.cpp
hip_hmm.cpp
hip_intercept.cpp
hip_log.cpp
hip_memory.cpp
hip_mempool.cpp
hip_mempool_impl.cpp
@@ -523,3 +523,6 @@ hipKernelGetName
hipOccupancyAvailableDynamicSMemPerBlock
hipGetProcAddress_spt
hipKernelGetParamInfo
hipExtDisableLogging
hipExtEnableLogging
hipExtSetLoggingParams
@@ -885,6 +885,9 @@ hipError_t hipOccupancyAvailableDynamicSMemPerBlock(size_t* dynamicSmemSize, con
int numBlocks, int blockSize);
hipError_t hipKernelGetParamInfo(hipKernel_t kernel, size_t paramIndex, size_t* paramOffset,
size_t* paramSize);
hipError_t hipExtDisableLogging();
hipError_t hipExtEnableLogging();
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask);
} // namespace hip
namespace hip {
@@ -1432,6 +1435,9 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) {
ptrDispatchTable->hipKernelGetName_fn = hip::hipKernelGetName;
ptrDispatchTable->hipOccupancyAvailableDynamicSMemPerBlock_fn = hip::hipOccupancyAvailableDynamicSMemPerBlock;
ptrDispatchTable->hipKernelGetParamInfo_fn = hip::hipKernelGetParamInfo;
ptrDispatchTable->hipExtDisableLogging_fn = hip::hipExtDisableLogging;
ptrDispatchTable->hipExtEnableLogging_fn = hip::hipExtEnableLogging;
ptrDispatchTable->hipExtSetLoggingParams_fn = hip::hipExtSetLoggingParams;
}
#if HIP_ROCPROFILER_REGISTER > 0
@@ -2114,15 +2120,19 @@ HIP_ENFORCE_ABI(HipDispatchTable, hipOccupancyAvailableDynamicSMemPerBlock_fn, 5
HIP_ENFORCE_ABI(HipDispatchTable, hipGetProcAddress_spt_fn, 506);
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
HIP_ENFORCE_ABI(HipDispatchTable, hipKernelGetParamInfo_fn, 507);
// HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
HIP_ENFORCE_ABI(HipDispatchTable, hipExtDisableLogging_fn, 508);
HIP_ENFORCE_ABI(HipDispatchTable, hipExtEnableLogging_fn, 509);
HIP_ENFORCE_ABI(HipDispatchTable, hipExtSetLoggingParams_fn, 510);
// if HIP_ENFORCE_ABI entries are added for each new function pointer in the table, the number below
// will be +1 of the number in the last HIP_ENFORCE_ABI line. E.g.:
//
// HIP_ENFORCE_ABI(<table>, <functor>, 8)
//
// HIP_ENFORCE_ABI_VERSIONING(<table>, 9) <- 8 + 1 = 9
HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 508)
HIP_ENFORCE_ABI_VERSIONING(HipDispatchTable, 511)
static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 20,
static_assert(HIP_RUNTIME_API_TABLE_MAJOR_VERSION == 0 && HIP_RUNTIME_API_TABLE_STEP_VERSION == 21,
"If you get this error, add new HIP_ENFORCE_ABI(...) code for the new function "
"pointers and then update this check so it is true");
#endif
@@ -645,6 +645,9 @@ global:
hipOccupancyAvailableDynamicSMemPerBlock;
hipGetProcAddress_spt;
hipKernelGetParamInfo;
hipExtDisableLogging;
hipExtEnableLogging;
hipExtSetLoggingParams;
local:
*;
} hip_7.1;
@@ -0,0 +1,31 @@
#include <hip/hip_runtime.h>
#include "hip_internal.hpp"
#include "hip_platform.hpp"
namespace hip {
hipError_t hipExtEnableLogging() {
HIP_INIT_API(hipExtEnableLogging);
amd::ScopedLock lock(PlatformState::instance().getLogLock());
AMD_LOG_LEVEL = PlatformState::instance().log_level_;
AMD_LOG_MASK = PlatformState::instance().log_mask_;
HIP_RETURN(hipSuccess);
}
hipError_t hipExtDisableLogging() {
HIP_INIT_API(hipExtDisableLogging);
amd::ScopedLock lock(PlatformState::instance().getLogLock());
AMD_LOG_LEVEL = 0;
HIP_RETURN(hipSuccess);
}
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask) {
HIP_INIT_API(hipExtSetLoggingParams, log_level, log_size, log_mask);
amd::ScopedLock lock(PlatformState::instance().getLogLock());
// Store logging parameters for later activation
PlatformState::instance().log_level_ = log_level;
PlatformState::instance().log_size_ = log_size;
PlatformState::instance().log_mask_ = log_mask;
HIP_RETURN(hipSuccess);
}
} // namespace::hip
@@ -50,9 +50,12 @@ class PlatformState {
// Unique FD Store Lock
amd::Monitor ufd_lock_{true};
// Lock for logging operations
amd::Monitor lg_lock_{true};
// Singleton object
static PlatformState* platform_;
PlatformState() {}
PlatformState() : log_level_(0), log_size_(0), log_mask_(0) {}
~PlatformState() {}
public:
@@ -113,6 +116,14 @@ class PlatformState {
size_t UfdMapSize() const { return ufd_map_.size(); }
// Logging lock accessor
amd::Monitor& getLogLock() { return lg_lock_; }
// Friend functions for logging access
friend hipError_t hipExtEnableLogging();
friend hipError_t hipExtDisableLogging();
friend hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask);
inline bool RegisterLibraryFunction(const hipKernel_t f, const hipLibrary_t l) {
amd::ScopedLock lock(lock_);
if (library_functions_.find(f) == library_functions_.end()) {
@@ -150,5 +161,10 @@ class PlatformState {
void* dynamicLibraryHandle_{nullptr};
std::unordered_map<hipKernel_t, hipLibrary_t> library_functions_;
// Logging state (moved from LoggingInfo singleton)
size_t log_level_;
size_t log_size_;
size_t log_mask_;
};
} // namespace hip
@@ -2067,4 +2067,13 @@ hipError_t hipKernelGetParamInfo(hipKernel_t kernel, size_t paramIndex, size_t*
size_t* paramSize) {
return hip::GetHipDispatchTable()->hipKernelGetParamInfo_fn(kernel, paramIndex, paramOffset,
paramSize);
}
hipError_t hipExtEnableLogging() {
return hip::GetHipDispatchTable()->hipExtEnableLogging_fn();
}
hipError_t hipExtDisableLogging() {
return hip::GetHipDispatchTable()->hipExtDisableLogging_fn();
}
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask) {
return hip::GetHipDispatchTable()->hipExtSetLoggingParams_fn(log_level, log_size, log_mask);
}
@@ -138,14 +138,6 @@
"=== TODO ===",
"Unit_Device_tgammaf_Accuracy_Limited_Positive",
"=== TODO === fail on 100% test data",
"Unit_Device_hexp10_Accuracy_Positive",
"Unit_Device_h2exp10_Accuracy_Positive",
"Unit_Device_hexp2_Accuracy_Positive",
"Unit_Device_h2exp2_Accuracy_Positive",
"Unit_Device_hlog_Accuracy_Positive",
"Unit_Device_h2log_Accuracy_Positive",
"Unit_Device_hlog10_Accuracy_Positive",
"Unit_Device_h2log10_Accuracy_Positive",
"Unit_Device___hfma2_Accuracy_Positive",
#endif
#if defined gfx90a || defined gfx942 || defined gfx950
@@ -752,6 +752,8 @@
#endif
"=== Following tests disabled as it should be a local perf test",
"Performance_hipExtLaunchKernelGGL_QueryGPUFrequency",
"Unit_hipDynamicLogging_Positive_Basic",
"Unit_hipDynamicLogging_Positive_MultipleEnableDisable",
"End of json"
]
}
@@ -20,6 +20,7 @@ set(TEST_SRC
binary_partition.cc
cg_ballot.cc
cg_any_all.cc
split_barrier.cc
)
if(HIP_PLATFORM STREQUAL "nvidia")
@@ -0,0 +1,123 @@
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip/hip_cooperative_groups.h>
#include <hip_test_common.hh>
static __global__ void wg_split_barrier(float *out, float *in) {
namespace cg = cooperative_groups;
__shared__ float mid[32];
size_t i = threadIdx.x;
auto tb = cg::this_thread_block();
out[i] = in[i] * 2.0f;
auto tok = tb.barrier_arrive();
// use tid 0 to populate shared mem
if (i == 0) {
for (size_t j = 0; j < 32; j++) {
mid[j] = in[j];
}
}
tb.barrier_wait(std::move(tok));
out[i] += mid[i];
}
TEST_CASE("Unit_coop_thread_block_split_barrier") {
constexpr size_t size = 32;
float *d_out, *d_in;
HIP_CHECK(hipMalloc(&d_out, sizeof(float) * size));
HIP_CHECK(hipMalloc(&d_in, sizeof(float) * size));
std::vector<float> in(size, 0.0f), out = in;
for (size_t i = 0; i < size; i++) {
in[i] = i + 1;
}
HIP_CHECK(hipMemset(d_out, 0, sizeof(float) * size));
HIP_CHECK(
hipMemcpy(d_in, in.data(), sizeof(float) * size, hipMemcpyHostToDevice));
wg_split_barrier<<<1, size>>>(d_out, d_in);
HIP_CHECK(hipMemcpy(out.data(), d_out, sizeof(float) * size,
hipMemcpyDeviceToHost));
HIP_CHECK(hipFree(d_out));
HIP_CHECK(hipFree(d_in));
for (size_t i = 0; i < size; i++) {
INFO("Index: " << i << " in: " << in[i] << " out: " << out[i]);
REQUIRE((in[i] * 3.0f) == Catch::Approx(out[i]));
}
}
static __global__ void grid_split_barrier(int *data, int *result, int N) {
namespace cg = cooperative_groups;
cg::grid_group grid = cg::this_grid();
int gid = blockIdx.x * blockDim.x + threadIdx.x;
auto tok = grid.barrier_arrive();
if (gid < N) {
data[gid] = gid + 1;
}
grid.barrier_wait(std::move(tok));
if (grid.thread_rank() == 0) {
int sum = 0;
for (int i = 0; i < N; i++)
sum += data[i];
*result = sum;
}
}
TEST_CASE("Unit_coop_grids_split_barrier") {
hipDeviceProp_t prop;
HIP_CHECK(hipGetDeviceProperties(&prop, 0));
if (prop.cooperativeLaunch != 0) {
int N = 1024;
const int threads = 128;
const int blocks = (N + threads - 1) / threads;
int *d_in, *d_out;
HIP_CHECK(hipMalloc(&d_in, N * sizeof(int)));
HIP_CHECK(hipMalloc(&d_out, sizeof(int)));
void *args[] = {&d_in, &d_out, &N};
dim3 grid(blocks);
dim3 block(threads);
HIP_CHECK(hipLaunchCooperativeKernel((void *)grid_split_barrier, grid,
block, args, 0, 0));
HIP_CHECK(hipDeviceSynchronize());
int out = 0;
HIP_CHECK(hipMemcpy(&out, d_out, sizeof(int), hipMemcpyDeviceToHost));
HIP_CHECK(hipFree(d_in));
HIP_CHECK(hipFree(d_out));
REQUIRE(out == ((N * (N + 1)) / 2));
}
}
@@ -7,6 +7,7 @@ set(TEST_SRC
hipDrvGetErrorString.cc
hipGetLastError.cc
hipPeekAtLastError.cc
hipDynamicLogging.cc
)
if(UNIX)
@@ -0,0 +1,134 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <iostream>
#include <sstream>
#include <streambuf>
#include <string>
#include <fstream>
#include <fcntl.h>
#ifdef _WIN32
#include <windows.h>
#include <io.h>
#include <stdio.h>
#include <stdlib.h>
#define dup _dup
#define dup2 _dup2
#define fd_close _close
#define unlink _unlink
#define STDERR_FD _fileno(stderr)
#define OPEN_FLAGS (_O_WRONLY | _O_CREAT | _O_TRUNC)
#define OPEN_MODE (_S_IREAD | _S_IWRITE)
#define open _open
#else
#include <unistd.h>
#define fd_close close
#define STDERR_FD STDERR_FILENO
#define OPEN_FLAGS (O_WRONLY | O_CREAT | O_TRUNC)
#define OPEN_MODE 0644
#endif
// Class to capture all stderr output (HIP logging uses stderr)
class OutCapture {
private:
std::stringstream captured_stream_;
std::streambuf* cerr_backup_;
int stderr_backup_;
std::string temp_file_;
static std::string getTempFilePath() {
#ifdef _WIN32
char temp_path[MAX_PATH];
if (GetTempPathA(MAX_PATH, temp_path)) {
return std::string(temp_path) + "hip_stderr_capture.txt";
}
// Fallback to current directory
return "hip_stderr_capture.txt";
#else
return "/tmp/hip_stderr_capture.txt";
#endif
}
public:
OutCapture() : temp_file_(getTempFilePath()) {
// Backup original cerr stream buffer (HIP logging uses stderr)
cerr_backup_ = std::cerr.rdbuf();
// Backup original stderr file descriptor
stderr_backup_ = dup(STDERR_FD);
}
void startCapture() {
// Clear any previous content
captured_stream_.str("");
captured_stream_.clear();
// Redirect std::cerr to our stringstream
std::cerr.rdbuf(captured_stream_.rdbuf());
// Redirect stderr file descriptor to temp file (for fprintf to stderr)
int temp_fd = open(temp_file_.c_str(), OPEN_FLAGS, OPEN_MODE);
if (temp_fd != -1) {
dup2(temp_fd, STDERR_FD);
fd_close(temp_fd);
}
}
std::string stopCapture() {
// Restore original cerr stream
std::cerr.rdbuf(cerr_backup_);
// Restore original stderr file descriptor
dup2(stderr_backup_, STDERR_FD);
// Read from temp file (captures fprintf(stderr) output from HIP logging)
std::ifstream temp_file(temp_file_);
std::string file_content;
if (temp_file.is_open()) {
std::string line;
while (std::getline(temp_file, line)) {
file_content += line + "\n";
}
temp_file.close();
}
// Combine both captures: C++ streams and file descriptor output
std::string stream_content = captured_stream_.str();
std::string total_output = stream_content + file_content;
// Clean up temp file
unlink(temp_file_.c_str());
return total_output;
}
~OutCapture() {
// Ensure everything is restored
std::cerr.rdbuf(cerr_backup_);
dup2(stderr_backup_, STDERR_FD);
fd_close(stderr_backup_);
unlink(temp_file_.c_str());
}
};
@@ -0,0 +1,156 @@
/*
Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include <hip_test_common.hh>
#include "OutCapture.hh"
/**
* @addtogroup hipDynamicLogging hipDynamicLogging
* @{
* @ingroup ErrorTest
* `hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask)` -
* Sets logging parameters for HIP runtime.
* `hipExtEnableLogging()` -
* Enables HIP runtime logging.
* `hipExtDisableLogging()` -
* Disables HIP runtime logging.
*/
static bool hipDynamicLoggingTest() {
// Create output capture instance
OutCapture capture;
capture.startCapture();
// Set Logging params
HIP_CHECK(hipExtSetLoggingParams(4, 0, -1));
// Logging is disabled here - allocate memory
int* dptr = nullptr;
HIP_CHECK(hipMalloc(&dptr, sizeof(int)));
// Stop capture after hipMalloc and check no output (logging disabled)
std::string malloc_output = capture.stopCapture();
if (malloc_output.size() != 0) {
INFO("Unexpected logging output during hipMalloc (logging should be disabled): " << malloc_output);
return false;
}
// Start capture before enabling logging
capture.startCapture();
// Enable logging and do memset
HIP_CHECK(hipExtEnableLogging());
HIP_CHECK(hipMemset(dptr, 0x00, sizeof(int)));
// Disable logging
HIP_CHECK(hipExtDisableLogging());
// Stop capture after disabling logging and check for output
std::string logging_output = capture.stopCapture();
if (logging_output.size() == 0) {
INFO("Expected logging output during enabled logging period, but got none");
return false;
}
// Clean up
HIP_CHECK(hipFree(dptr));
INFO("Successfully captured HIP logging output (" << logging_output.size() << " bytes)");
INFO("Logging output: " << logging_output);
return true;
}
/**
* Test Description
* ------------------------
* - Validates that HIP dynamic logging works correctly:
* 1. No output when logging is disabled
* 2. Logging output is captured when logging is enabled
* 3. hipMemset operation produces logging output during enabled period
* Test source
* ------------------------
* - unit/errorHandling/hipDynamicLogging.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
TEST_CASE("Unit_hipDynamicLogging_Positive_Basic") {
int numDevices = 0;
HIP_CHECK(hipGetDeviceCount(&numDevices));
if (numDevices <= 0) {
HipTest::HIP_SKIP_TEST("Skipping hipDynamicLogging test - no devices available");
return;
}
REQUIRE(hipDynamicLoggingTest() == true);
}
/**
* Test Description
* ------------------------
* - Validates that hipExtSetLoggingParams sets logging parameters correctly
* and that logging can be enabled/disabled multiple times
* Test source
* ------------------------
* - unit/errorHandling/hipDynamicLogging.cc
* Test requirements
* ------------------------
* - HIP_VERSION >= 5.6
*/
TEST_CASE("Unit_hipDynamicLogging_Positive_MultipleEnableDisable") {
int numDevices = 0;
HIP_CHECK(hipGetDeviceCount(&numDevices));
if (numDevices <= 0) {
HipTest::HIP_SKIP_TEST("Skipping hipDynamicLogging test - no devices available");
return;
}
// Test multiple enable/disable cycles
OutCapture capture;
int* dptr = nullptr;
HIP_CHECK(hipMalloc(&dptr, sizeof(int)));
// Set different logging parameters
HIP_CHECK(hipExtSetLoggingParams(3, 0, -1));
for (int i = 0; i < 3; ++i) {
// Start capture and enable logging
capture.startCapture();
HIP_CHECK(hipExtEnableLogging());
HIP_CHECK(hipMemset(dptr, 0x42, sizeof(int)));
HIP_CHECK(hipExtDisableLogging());
// Check that we captured some output
std::string output = capture.stopCapture();
REQUIRE(output.size() > 0);
}
HIP_CHECK(hipFree(dptr));
}
/**
* End doxygen group ErrorTest.
* @}
*/
@@ -45,7 +45,7 @@ MATH_UNARY_HP_KERNEL_DEF(hcos);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(hcos, static_cast<float (*)(float)>(std::cos),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2cos);
@@ -63,7 +63,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2cos);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(h2cos, static_cast<float (*)(float)>(std::cos),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hsin);
@@ -82,7 +82,7 @@ MATH_UNARY_HP_KERNEL_DEF(hsin);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(hsin, static_cast<float (*)(float)>(std::sin),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2sin);
@@ -100,7 +100,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2sin);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(h2sin, static_cast<float (*)(float)>(std::sin),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hexp);
@@ -119,7 +119,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(hexp, static_cast<float (*)(float)>(std::exp),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2exp);
@@ -137,7 +137,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(h2exp, static_cast<float (*)(float)>(std::exp),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hexp10);
@@ -156,7 +156,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp10);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(hexp10, static_cast<float (*)(float)>(exp10f),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2exp10);
@@ -174,7 +174,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp10);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(h2exp10, static_cast<float (*)(float)>(exp10f),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hexp2);
@@ -193,7 +193,7 @@ MATH_UNARY_HP_KERNEL_DEF(hexp2);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(hexp2, static_cast<float (*)(float)>(std::exp2),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2exp2);
@@ -211,7 +211,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2exp2);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(h2exp2, static_cast<float (*)(float)>(std::exp2),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hlog);
@@ -230,7 +230,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(hlog, static_cast<float (*)(float)>(std::log),
ULPValidatorBuilderFactory<float>(1));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2log);
@@ -248,7 +248,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(h2log, static_cast<float (*)(float)>(std::log),
ULPValidatorBuilderFactory<float>(1));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hlog10);
@@ -267,7 +267,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog10);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(hlog10, static_cast<float (*)(float)>(std::log10),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2log10);
@@ -285,7 +285,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log10);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(h2log10, static_cast<float (*)(float)>(std::log10),
ULPValidatorBuilderFactory<float>(2));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hlog2);
@@ -304,7 +304,7 @@ MATH_UNARY_HP_KERNEL_DEF(hlog2);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(hlog2, static_cast<float (*)(float)>(std::log2),
ULPValidatorBuilderFactory<float>(1));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2log2);
@@ -322,7 +322,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2log2);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(h2log2, static_cast<float (*)(float)>(std::log2),
ULPValidatorBuilderFactory<float>(1));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hsqrt);
@@ -341,7 +341,7 @@ MATH_UNARY_HP_KERNEL_DEF(hsqrt);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(hsqrt, static_cast<float (*)(float)>(std::sqrt),
ULPValidatorBuilderFactory<float>(1));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(h2sqrt);
@@ -359,7 +359,7 @@ MATH_UNARY_HP_KERNEL_DEF(h2sqrt);
* - HIP_VERSION >= 5.2
*/
MATH_UNARY_HP_TEST_DEF_IMPL(h2sqrt, static_cast<float (*)(float)>(std::sqrt),
ULPValidatorBuilderFactory<float>(1));
ULPValidatorBuilderFactory<Float16>(1));
MATH_UNARY_HP_KERNEL_DEF(hceil);
@@ -187,7 +187,9 @@ template <typename T, typename... Ts> class MathTest {
std::stringstream ss;
ss << "Input value(s): " << std::scientific
<< std::setprecision(std::numeric_limits<T>::max_digits10 - 1);
((ss << " " << args), ...) << "\n" << actual_val << " ";
((ss << " " << args), ...) << "\n"
<< "Output value: " << actual_val << "\n"
<< "Condition failed: ";
return ss.str();
}
@@ -25,6 +25,12 @@ THE SOFTWARE.
#include <catch2/catch_all.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include <cstdint>
#include <cstring>
#include <iomanip>
#include "Float16.hh"
// Define a new MatcherBase class with a public 'describe' member function because
// Catch::MatcherBase::describe is protected and thus can't be used via a pointer to
// Catch::MatcherBase.
@@ -61,6 +67,113 @@ template <typename T, typename Matcher> class ValidatorBase : public MatcherBase
bool nan = false;
};
struct Float16WithinUlpsMatcher : MatcherBase<Float16> {
Float16WithinUlpsMatcher(Float16 target, uint64_t ulps) : m_target(target), m_ulps(ulps) {}
bool match(Float16 const& matchee) const override {
// Comparison with NaN should always be false.
// This way we can rule it out before getting into the ugly details
if (__hisnan(matchee) || __hisnan(m_target)) {
return false;
}
auto value_bits = convertFloat16toInt16(matchee);
auto target_bits = convertFloat16toInt16(m_target);
// If signs differ, handle the special +0 vs -0 case explicitly.
if ((value_bits < 0) != (target_bits < 0)) {
return matchee == m_target;
}
auto ulp_diff = std::abs(value_bits - target_bits);
return static_cast<uint64_t>(ulp_diff) <= m_ulps;
}
std::string describe() const override {
std::stringstream ret;
ret << "is within " << m_ulps << " ULPs of ";
write(ret, m_target);
ret << 'f';
ret << " ([";
write(ret, step(m_target, -FLOAT16_MAX, m_ulps));
ret << ", ";
write(ret, step(m_target, FLOAT16_MAX, m_ulps));
ret << "])";
return ret.str();
}
private:
Float16 getNextAfter(Float16 from, Float16 direction) const {
constexpr int16_t signbit_float16 = 0x8000;
// Encode inputs as 16-bit integers
const int16_t from_bits = convertFloat16toInt16(from);
const int16_t direction_bits = convertFloat16toInt16(direction);
// Special cases
if (from_bits == direction_bits) return direction_bits;
if (std::abs(from_bits) == static_cast<int16_t>(0) &&
std::abs(direction_bits) == static_cast<int16_t>(0))
return direction;
// Makes integer comparisons reflect numeric ordering across sign.
const int16_t from_ordered = (from_bits < 0) ? signbit_float16 - from_bits : from_bits;
const int16_t direction_ordered =
(direction_bits < 0) ? signbit_float16 - direction_bits : direction_bits;
// Decide whether to move up or down by one ULP
const int16_t step = (from_ordered < direction_ordered) ? 1 : -1;
// Take one step
const int16_t after_step_ordered = from_ordered + step;
// Map back from ordered space to raw Float16 bits.
int16_t next_bits =
(after_step_ordered < 0) ? signbit_float16 - after_step_ordered : after_step_ordered;
// Handle boundary behavior for the most-negative edge case.
if (from_ordered == -1 && (from_ordered < direction_ordered)) {
next_bits = signbit_float16;
}
return convertInt16toFloat16(next_bits);
}
Float16 step(Float16 start, Float16 direction, uint64_t steps) const {
Float16 result = start;
for (uint64_t i = 0; i < steps; ++i) {
result = getNextAfter(result, direction);
}
return result;
}
void write(std::ostream& out, Float16 num) const {
const uint32_t float16_max_digits = 5;
out << std::scientific << std::setprecision(float16_max_digits) << num;
}
static Float16 convertInt16toFloat16(int16_t d) {
Float16 i;
std::memcpy(&i, &d, sizeof(int16_t));
return i;
}
static int16_t convertFloat16toInt16(Float16 d) {
uint16_t i;
std::memcpy(&i, &d, sizeof(Float16));
return i;
}
Float16 m_target;
uint64_t m_ulps;
};
template <typename T> auto ULPValidatorBuilderFactory(int64_t ulps) {
return [=](T target, auto&&...) {
return std::make_unique<ValidatorBase<T, Catch::Matchers::WithinUlpsMatcher>>(
@@ -68,6 +181,13 @@ template <typename T> auto ULPValidatorBuilderFactory(int64_t ulps) {
};
};
template <> inline auto ULPValidatorBuilderFactory<Float16>(int64_t ulps) {
return [=](Float16 target, auto&&...) {
return std::make_unique<ValidatorBase<Float16, Float16WithinUlpsMatcher>>(
target, Float16WithinUlpsMatcher(target, ulps));
};
};
template <typename T> auto AbsValidatorBuilderFactory(double margin) {
return [=](T target, auto&&...) {
return std::make_unique<ValidatorBase<T, Catch::Matchers::WithinAbsMatcher>>(
@@ -96,7 +216,7 @@ template <typename T> class EqValidator : public MatcherBase<T> {
std::string describe() const override {
std::stringstream ss;
ss << " is not equal to " << target_;
ss << "is equal to " << target_;
return ss.str();
}
@@ -177,8 +177,8 @@ static bool checkhipMemPoolTrimTo(hipStream_t stream, int N, int dev = 0) {
testObj.transferFromMempool(stream);
testObj.freeDevBuf(stream);
// verify and validate
REQUIRE(true == testObj.validateResult());
HIP_CHECK(hipStreamSynchronize(stream));
REQUIRE(true == testObj.validateResult());
}
HIP_CHECK(hipMemPoolDestroy(mem_pool));
return true;
@@ -494,7 +494,6 @@ HIP doesn't support the following CUDA functions/operators in ``cooperative_grou
* ``synchronize``
* ``memcpy_async``
* ``wait`` and ``wait_prior``
* ``barrier_arrive`` and ``barrier_wait``
* ``invoke_one`` and ``invoke_one_broadcast``
* ``reduce``
* ``reduce_update_async`` and ``reduce_store_async``
@@ -9615,6 +9615,45 @@ hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject);
/**
* @}
*/
/**
* @brief Enable HIP runtime logging.
*
* This function enables the HIP runtime logging mechanism, allowing diagnostic
* and trace information to be captured during HIP API execution.
*
* @returns #hipSuccess
*
* @see hipExtDisableLogging, hipExtSetLoggingParams
*/
hipError_t hipExtEnableLogging();
/**
* @brief Disable HIP runtime logging.
*
* This function disables the HIP runtime logging mechanism, stopping the capture
* of diagnostic and trace information during HIP API execution.
*
* @returns #hipSuccess
*
* @see hipExtEnableLogging, hipExtSetLoggingParams
*/
hipError_t hipExtDisableLogging();
/**
* @brief Set HIP runtime logging parameters.
*
* This function configures the logging behavior of the HIP runtime, including
* the verbosity level, buffer size, and which components to log.
*
* @param [in] log_level The logging verbosity level. Higher values produce more detailed output.
* @param [in] log_size Reserved for future use. Currently not implemented.
* @param [in] log_mask A bitmask specifying which HIP runtime components to log.
*
* @returns #hipSuccess, #hipErrorInvalidValue
*
* @see hipExtEnableLogging, hipExtDisableLogging
*/
hipError_t hipExtSetLoggingParams(size_t log_level, size_t log_size, size_t log_mask);
#ifdef __cplusplus
} /* extern "c" */
#endif
@@ -2,7 +2,7 @@ default_stages: [pre-commit]
fail_fast: true
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
rev: v6.0.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
@@ -12,7 +12,7 @@ repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version. Check https://github.com/astral-sh/ruff-pre-commit#version-compatibility
# for the latest ruff version supported by the hook.
rev: v0.12.12
rev: v0.14.11
hooks:
- id: ruff-check
args: [--fix]
@@ -49,6 +49,10 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
* Fix issue where counter collection data was empty when profiling workload which spawn multiple child processes
* Fix issue where dispatch filtering in a range (e.g. >2) was not working
* Fix redundant warnings for compute/memory partition not found for < MI 300 series GPUs by skipping partition checks
### Removed
* Removed "VL1 Lat" metric for AMD Instinct MI300 series GPUs, due to MI300 series not supporting TCP_TCP_LATENCY_sum counter.
+7 -13
Bestand weergeven
@@ -4,14 +4,13 @@
ROCm Compute Profiler is a system performance profiling tool for machine
learning/HPC workloads running on AMD MI GPUs. The tool presently
targets usage on MI100, MI200, and MI300 accelerators.
targets usage on MI100, MI200, MI300, and MI350 series accelerators.
* For more information on available features, installation steps, and
workload profiling and analysis, please refer to the online
[documentation](https://rocm.docs.amd.com/projects/rocprofiler-compute/en/latest/).
* ROCm Compute Profiler is an AMD open source research project and is not supported
as part of the ROCm software stack. We welcome contributions and
* ROCm Compute Profiler is an AMD open source tool that is part of the ROCm software stack. We welcome contributions and
feedback from the community. Please see the
[CONTRIBUTING.md](CONTRIBUTING.md) file for additional details on our
contribution process.
@@ -39,8 +38,8 @@ python3 -m pip install -r requirements.txt
## Testing
Populate the <usename> variable in `docker/docker-compose.customrocmtest.yml`.
Populate the <rocm_build_image> variable in `docker/Dockerfile.customrocmtest` based on latest ROCm CI build information.
Populate the <username> variable in `docker/docker-compose.customrocmtest.yml`.
Populate the <tarball_name> variable in `docker/Dockerfile.customrocmtest` based on latest TheRock nightly build information.
To quickly get the environment (bash shell) for building and testing, run the following commands:
* `cd docker`
@@ -115,7 +114,7 @@ This software can be cited using a Zenodo
style reference is provided below for convenience:
```
@software{xiaomin_lu_2022_7314631
@misc{xiaomin_lu_2022_7314631
author = {Xiaomin Lu and
Cole Ramos and
Fei Zheng and
@@ -124,12 +123,7 @@ style reference is provided below for convenience:
Keith Lowery and
Nicholas Curtis and
Cristian Di Pietrantonio},
title = {ROCm/rocprofiler-compute: v3.1.0 (12 February 2025)},
month = February,
year = 2025,
publisher = {Zenodo},
version = {v3.1.0},
doi = {10.5281/zenodo.7314631},
url = {https://doi.org/10.5281/zenodo.7314631}
title = {rocprofiler-compute},
url = {https://github.com/ROCm/rocm-systems/blob/develop/projects/rocprofiler-compute}
}
```
@@ -1,32 +1,52 @@
# Use a base image
FROM <rocm_build_image>
FROM ubuntu:22.04
# Set the working directory
WORKDIR /app
# Install curl first (needed for ROCm download)
RUN apt-get update && apt-get install -y curl
# Define the tarball name as a variable
# Check https://therock-nightly-tarball.s3.amazonaws.com/index.html for latest builds
# Use therock-dist-linux-gfx<arch>-dcgpu-<rocm-version>.tar.gz naming convention
ARG TARBALL_NAME=<tarball_name>
# Install ROCm from TheRock Nightly build
RUN mkdir -p /rocm && \
curl -fLO https://therock-nightly-tarball.s3.amazonaws.com/${TARBALL_NAME} && \
tar -xf ${TARBALL_NAME} -C /rocm && \
rm ${TARBALL_NAME}
# Set environment variables for ROCm
ENV PATH="/rocm/bin:${PATH}" \
ROCM_PATH="/rocm" \
LD_LIBRARY_PATH="/rocm/lib:/rocm/lib/rocm_sysdeps/lib:${LD_LIBRARY_PATH}" \
HIP_DEVICE_LIB_PATH="/rocm/llvm/amdgcn/bitcode" \
HIP_PLATFORM=amd
# Update package list and install prerequisites
RUN apt-get update && apt-get install -y \
software-properties-common cmake locales git curl \
software-properties-common cmake locales git \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update
# Allows running git commands in /app
RUN git config --global --add safe.directory /app
# Generate the desired locale
RUN locale-gen en_US.UTF-8
# Install Python 3.10 and pip
RUN apt-get install -y python3.10 python3.10-venv python3.10-dev python3-pip libsqlite3-dev
RUN python3.10 -m venv venv
ENV PATH="venv/bin:$PATH"
RUN python3.10 -m venv /venv
ENV PATH="/venv/bin:$PATH"
RUN python -m pip install --upgrade pip
# Install any dependencies specified in requirements.txt
WORKDIR /app/projects/rocprofiler-compute
# Install any rocprofiler-compute dependencies specified in requirements.txt
COPY projects/rocprofiler-compute/requirements.txt /app/projects/rocprofiler-compute/requirements.txt
COPY projects/rocprofiler-compute/requirements-test.txt /app/projects/rocprofiler-compute/requirements-test.txt
RUN python -m pip install -r requirements.txt -r requirements-test.txt
RUN python -m pip install -r /app/projects/rocprofiler-compute/requirements.txt -r /app/projects/rocprofiler-compute/requirements-test.txt
# Set the working directory
WORKDIR /app
# Allows running git commands in /app
RUN git config --global --add safe.directory /app
# Run interactive bash shell
CMD ["/bin/bash"]
Binair bestand niet weergegeven.

Voor

Breedte:  |  Hoogte:  |  Grootte: 185 KiB

Na

Breedte:  |  Hoogte:  |  Grootte: 254 KiB

Binair bestand niet weergegeven.

Voor

Breedte:  |  Hoogte:  |  Grootte: 34 KiB

Na

Breedte:  |  Hoogte:  |  Grootte: 240 KiB

@@ -15,7 +15,7 @@ This section provides an overview of ROCm Compute Profiler's CLI analysis featur
* :ref:`Metric customization <cli-analysis-options>`: Isolate a subset of built-in metrics or build your own profiling configuration.
* :ref:`Filtering <cli-analysis-options>`: Hone in on a particular kernel, GPU ID, or dispatch ID via post-process filtering.
* :ref:`Per-kernel roofline analysis <per-kernel-roofline>`: Detailed arithmetic intensity and performance analysis for individual kernels.
Run ``rocprof-compute analyze -h`` for more details.
@@ -346,6 +346,7 @@ Show System Speed-of-Light and CS_Busy blocks only
this case, ``1`` is the ID for System Speed-of-Light and ``5.1.0`` the ID for
GPU Busy Cycles metric.
Filter kernels
First, list the top kernels in your application using `--list-stats`.
@@ -534,36 +535,40 @@ Analysis database example
.. code-block:: shell-session
$ rocprof-compute analyze --verbose --db test -p workloads/vmem/MI300X_A1 -p workloads/vmem1/MI300X_A1
$ rocprof-compute analyze --verbose --output-name test --output-format db -p workloads/nbody/MI300X_A1 -p workloads/nbody1/MI300X_A1
DEBUG Execution mode = analyze
__ _
_ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___
| '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \
| | | (_) | (__| |_) | | | (_) | _|_____| (_| (_) | | | | | | |_) | |_| | || __/
|_| \___/ \___| .__/|_| \___/|_| \___\___/|_| |_| |_| .__/ \__,_|\__\___|
|_| |_|
__ _
_ __ ___ ___ _ __ _ __ ___ / _| ___ ___ _ __ ___ _ __ _ _| |_ ___
| '__/ _ \ / __| '_ \| '__/ _ \| |_ _____ / __/ _ \| '_ ` _ \| '_ \| | | | __/ _ \
| | | (_) | (__| |_) | | | (_) | _|_____| (_| (_) | | | | | | |_) | |_| | || __/
|_| \___/ \___| .__/|_| \___/|_| \___\___/|_| |_| |_| .__/ \__,_|\__\___|
|_| |_|
INFO Analysis mode = db
DEBUG [omnisoc init]
DEBUG [omnisoc init]
DEBUG [analysis] prepping to do some analysis
INFO [analysis] deriving rocprofiler-compute metrics...
WARNING Roofline ceilings not found for /app/projects/rocprofiler-compute/workloads/vmem/MI300X_A1.
WARNING Roofline ceilings not found for /app/projects/rocprofiler-compute/workloads/vmem1/MI300X_A1.
WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/vmem/MI300X_A1.
WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/vmem1/MI300X_A1.
DEBUG Collected dispatch data
DEBUG Applied analysis mode filters
DEBUG Calculated dispatch data
DEBUG Collected metrics data
WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg(
INFO Analysis mode = db
INFO ed45b0b189
DEBUG [omnisoc init]
INFO ed45b0b189
DEBUG [omnisoc init]
DEBUG [analysis] prepping to do some analysis
INFO [analysis] deriving rocprofiler-compute metrics...
DEBUG Collected roofline ceilings
WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/nbody/MI300X_A1.
WARNING PC sampling data not found for /app/projects/rocprofiler-compute/workloads/nbody1/MI300X_A1.
DEBUG Collected dispatch data
DEBUG Applied analysis mode filters
DEBUG Calculated dispatch data
DEBUG Collected metrics data
WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg(
(pmc_df.get("pmc_perf_ACCUM") / pmc_df.get("SQC_ICACHE_REQ")).where((pmc_df.get("SQC_ICACHE_REQ") != 0), None)) * 100), 0) - unsupported operand type(s) for /: 'NoneType' and 'float'
WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg(
WARNING Failed to evaluate expression for 3.1.39 - Value: to_round((to_avg(
(pmc_df.get("pmc_perf_ACCUM") / pmc_df.get("SQC_ICACHE_REQ")).where((pmc_df.get("SQC_ICACHE_REQ") != 0), None)) * 100), 0) - unsupported operand type(s) for /: 'NoneType' and 'float'
DEBUG Calculated metric values
DEBUG Calculated roofline data points
DEBUG [analysis] generating analysis
DEBUG SQLite database initialized with name: test.db
DEBUG Initialized database: test.db
DEBUG Completed writing database
DEBUG Calculated metric values
DEBUG Calculated roofline data points
DEBUG [analysis] generating analysis
DEBUG SQLite database initialized with name: test.db
DEBUG Initialized database: test.db
INFO ed45b0b189
INFO ed45b0b189
DEBUG Completed writing database
WARNING Created file: test.db
@@ -28,7 +28,9 @@ Launch the standalone GUI analyzer
----------------------------------
To launch the ROCm Compute Profiler GUI analyzer, include the ``--gui`` flag with your
desired analysis command. For example:
desired analysis command.
For example:
.. code-block:: shell-session
@@ -386,26 +386,24 @@ class OmniAnalyze_Base:
sys.exit(0)
# Ensure analysis output does not overwrite existing files
if not args.output_name:
return
if args.output_name:
if not re.match(r"^[A-Za-z0-9_-]+$", args.output_name):
console_error(
"analysis",
"Analysis output file/folder name must "
"contain only alphanumeric characters "
"or underscores (_), hyphens (-).",
)
if not re.match(r"^[A-Za-z0-9_-]+$", args.output_name):
console_error(
"analysis",
"Analysis output file/folder name must "
"contain only alphanumeric characters "
"or underscores (_), hyphens (-).",
)
path_to_check = args.output_name
if args.output_format in ("txt", "db"):
path_to_check += f".{args.output_format}"
path_to_check = args.output_name
if args.output_format in ("txt", "db"):
path_to_check += f".{args.output_format}"
if Path(path_to_check).exists():
console_error(
f"Analysis output file/folder {path_to_check} already exists. "
"Please choose a different name."
)
if Path(path_to_check).exists():
console_error(
f"Analysis output file/folder {path_to_check} already exists. "
"Please choose a different name."
)
# Check if any kernel's counters are missing due to iteration multiplexing
if (
@@ -101,7 +101,9 @@ class db_analysis(OmniAnalyze_Base):
Database.init(db_name)
console_debug(f"Initialized database: {db_name}")
# Iterate over all workloads
for workload_path in self._runs.keys():
# Add workload
workload_obj = orm.Workload(
name=workload_path.split("/")[-2],
sub_name=workload_path.split("/")[-1],
@@ -113,38 +115,9 @@ class db_analysis(OmniAnalyze_Base):
)
Database.get_session().add(workload_obj)
for pc_sample in self._pc_sampling_data_per_workload.get(
workload_path, pd.DataFrame()
).itertuples():
Database.get_session().add(
orm.PCsampling(
source=pc_sample.source_line,
instruction=pc_sample.instruction,
count=pc_sample.count,
kernel_name=pc_sample.kernel_name,
offset=pc_sample.offset,
count_issue=pc_sample.count_issued,
count_stall=pc_sample.count_stalled,
stall_reason=pc_sample.stall_reason,
workload=workload_obj,
)
)
for roofline_data in self._roofline_data_per_workload.get(
workload_path, pd.DataFrame()
).itertuples():
Database.get_session().add(
orm.RooflineData(
kernel_name=roofline_data.kernel_name,
total_flops=roofline_data.total_flops,
l1_cache_data=roofline_data.l1_cache_data,
l2_cache_data=roofline_data.l2_cache_data,
hbm_cache_data=roofline_data.hbm_cache_data,
workload=workload_obj,
)
)
# Add kernel
kernel_objs: dict[str, orm.Kernel] = {}
for dispatch in self._dispatch_data_per_workload.get(
workload_path, pd.DataFrame()
).itertuples():
@@ -167,44 +140,101 @@ class db_analysis(OmniAnalyze_Base):
)
)
# Optimize: Pre-group values by (metric_id, kernel_name) for O(1) lookups
values_df = self._values_data_per_workload.get(
workload_path, pd.DataFrame()
)
values_grouped = {}
if not values_df.empty:
for value in values_df.itertuples():
key = (value.metric_id, value.kernel_name)
if key not in values_grouped:
values_grouped[key] = []
values_grouped[key].append(value)
for metric in self._metrics_info_data_per_workload.get(
# Add roofline data points
for roofline_data in self._roofline_data_per_workload.get(
workload_path, pd.DataFrame()
).itertuples():
for kernel_name in kernel_objs.keys():
metric_obj = orm.Metric(
name=metric.name,
metric_id=metric.metric_id,
description=metric.description,
unit=metric.unit,
table_name=metric.table_name,
sub_table_name=metric.sub_table_name,
kernel=kernel_objs[kernel_name],
if roofline_data.kernel_name not in kernel_objs:
console_warning(
f"Kernel {roofline_data.kernel_name} from roofline data "
"not found in dispatch data. Skipping roofline entry."
)
Database.get_session().add(metric_obj)
continue
Database.get_session().add(
orm.RooflineData(
total_flops=roofline_data.total_flops,
l1_cache_data=roofline_data.l1_cache_data,
l2_cache_data=roofline_data.l2_cache_data,
hbm_cache_data=roofline_data.hbm_cache_data,
kernel=kernel_objs[roofline_data.kernel_name],
)
)
# Direct lookup instead of iterating through all values
key = (metric.metric_id, kernel_name)
for value in values_grouped.get(key, []):
Database.get_session().add(
orm.Value(
metric=metric_obj,
value_name=value.value_name,
value=value.value,
)
# Add pc sampling data
for pc_sample in self._pc_sampling_data_per_workload.get(
workload_path, pd.DataFrame()
).itertuples():
if pc_sample.kernel_name not in kernel_objs:
console_warning(
f"Kernel {pc_sample.kernel_name} from PC sampling data "
"not found in dispatch data. Skipping PC sampling entry."
)
continue
Database.get_session().add(
orm.PCsampling(
source=pc_sample.source_line,
instruction=pc_sample.instruction,
count=pc_sample.count,
offset=pc_sample.offset,
count_issue=pc_sample.count_issued,
count_stall=pc_sample.count_stalled,
stall_reason=pc_sample.stall_reason,
kernel=kernel_objs[pc_sample.kernel_name],
)
)
# Add metrics and values - iterate on values, create metrics as needed
metrics_info_dict = {
row.metric_id: row
for row in self._metrics_info_data_per_workload.get(
workload_path, pd.DataFrame()
).itertuples()
}
metric_objs: dict[str, orm.MetricDefinition] = {}
for value in self._values_data_per_workload.get(
workload_path, pd.DataFrame()
).itertuples():
# Check if kernel exists
if value.kernel_name not in kernel_objs:
console_warning(
f"Kernel {value.kernel_name} from values data "
"not found in dispatch data. Skipping metric value."
)
continue
# Create or reuse metric object
if value.metric_id not in metric_objs:
# Fetch metric info
if value.metric_id not in metrics_info_dict:
console_warning(
f"Metric {value.metric_id} from values data "
"not found in metrics info. Skipping metric value."
)
continue
metric_info = metrics_info_dict[value.metric_id]
metric_objs[value.metric_id] = orm.MetricDefinition(
name=metric_info.name,
metric_id=metric_info.metric_id,
description=metric_info.description,
unit=metric_info.unit,
table_name=metric_info.table_name,
sub_table_name=metric_info.sub_table_name,
workload=workload_obj,
)
Database.get_session().add(metric_objs[value.metric_id])
# Add value
Database.get_session().add(
orm.MetricValue(
metric=metric_objs[value.metric_id],
kernel=kernel_objs[value.kernel_name],
value_name=value.value_name,
value=value.value,
)
)
# Add metadata
version = get_version(rocprof_compute_home)
Database.get_session().add(
orm.Metadata(
@@ -51,7 +51,6 @@ class webui_analysis(OmniAnalyze_Base):
self.app = dash.Dash(
__name__, title=PROJECT_NAME, external_stylesheets=[dbc.themes.CYBORG]
)
self.dest_dir = str(Path(args.path[0][0]).absolute().resolve())
self.arch: Optional[str] = None
self.__hidden_sections = ["Memory Chart"]
@@ -90,6 +89,7 @@ class webui_analysis(OmniAnalyze_Base):
kernel_top_df = base_data.dfs[1]
for kernel_id in base_data.filter_kernel_ids:
filt_kernel_names.append(str(kernel_top_df.loc[kernel_id, "Kernel_Name"]))
input_filters["kernel"] = filt_kernel_names
# setup app layout
from utils.gui_components.header import get_header
@@ -338,6 +338,7 @@ class webui_analysis(OmniAnalyze_Base):
)
args = self.get_args()
self.dest_dir = str(Path(args.path[0][0]).absolute().resolve())
# create 'mega dataframe'
self._runs[self.dest_dir].raw_pmc = file_io.create_df_pmc(
@@ -531,12 +531,15 @@ class RocProfCompute_Base:
and not args.attach_pid
):
# Use native counter collection tool
# Use lib* glob pattern to handle CMAKE_INSTALL_LIBDIR variations
# (lib, lib64, lib32, etc. depending on distribution)
native_tool_base_path = Path(sys.argv[0]).resolve().parents[2]
native_tool_glob_pattern = (
"lib*/rocprofiler-compute/librocprofiler-compute-tool.so"
)
try:
native_tool_path = str(
Path(sys.argv[0]).resolve().parents[2]
/ "lib"
/ "rocprofiler-compute"
/ "librocprofiler-compute-tool.so"
next(native_tool_base_path.glob(native_tool_glob_pattern))
)
except Exception as e:
console_debug(
@@ -552,6 +555,7 @@ class RocProfCompute_Base:
)
/ "librocprofiler-compute-tool.so"
)
native_tool_cpp_path = Path(__file__).resolve().parents[1] / "lib"
link_libraries = ("rocprofiler-sdk",)
build_command = (
# Create shared object
@@ -564,10 +568,10 @@ class RocProfCompute_Base:
# rocprofiler sdk library path
f"-L {str(Path(args.rocprofiler_sdk_tool_path).parent.parent)} "
# native tool source files (tool.cpp and helper.cpp)
f"{str(Path(__file__).parent.parent)}/"
"lib/rocprofiler_compute_tool.cpp "
f"{str(Path(__file__).parent.parent)}/"
"lib/helper.cpp "
f"{native_tool_cpp_path}/"
"rocprofiler_compute_tool.cpp "
f"{native_tool_cpp_path}/"
"helper.cpp "
# temporary shared object for native tool
f"-o {native_tool_path}"
)
@@ -575,7 +579,15 @@ class RocProfCompute_Base:
success, output = capture_subprocess_output(shlex.split(build_command))
console_debug(f"Build output: {output}")
if not success:
console_error("Failed to build native counter collection tool.")
console_error(
"Failed to use native counter collection tool.\n"
"Could not find pre-built .so file at: "
f"{native_tool_base_path / native_tool_glob_pattern}\n"
"Could not find source .cpp files in folder: "
f"{native_tool_cpp_path}\n"
"Please ensure the native tool library is installed "
"or source files are present."
)
if self.__profiler == "rocprofiler-sdk":
options = self.get_profiler_options(native_tool_path=native_tool_path)
@@ -92,7 +92,6 @@ class rocprofiler_sdk_profiler(RocProfCompute_Base):
/ "librocprofiler-sdk-rocattach.so"
)
options.update({
"ROCPROF_ATTACH_TOOL_LIBRARY": rocprofiler_sdk_tool_path,
"ROCPROF_ATTACH_LIBRARY": rocprofiler_attach_library_path,
"ROCPROF_ATTACH_PID": args.attach_pid,
})
@@ -45,7 +45,7 @@ from sqlalchemy.sql import Select
from utils.logger import console_debug, console_error
PREFIX = "compute_"
SCHEMA_VERSION = "1.1.0"
SCHEMA_VERSION = "1.2.0"
Base = declarative_base()
@@ -63,18 +63,16 @@ class Workload(Base):
# Workload can have multiple kernels
kernels = relationship("Kernel", back_populates="workload")
# Workload can have multiple roofline data points
roofline_data_points = relationship("RooflineData", back_populates="workload")
# Workload can have multiple pc_sampling values
pc_sampling_values = relationship("PCsampling", back_populates="workload")
# Workload can have multiple metric definitions
metric_definitions = relationship("MetricDefinition", back_populates="workload")
class Metric(Base):
__tablename__ = f"{PREFIX}metric"
class MetricDefinition(Base):
__tablename__ = f"{PREFIX}metric_definition"
metric_uuid = Column(Integer, primary_key=True)
kernel_uuid = Column(
Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False
workload_id = Column(
Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False
)
name = Column(String) # e.g. Wavefronts Num
metric_id = Column(String) # e.g. 4.1.3
@@ -83,27 +81,26 @@ class Metric(Base):
sub_table_name = Column(String) # e.g. Wavefront stats
unit = Column(String) # e.g. Gbps
# Metric can have one kernel
kernel = relationship("Kernel", back_populates="metrics")
# Metric can have multiple values
values = relationship("Value", back_populates="metric")
# Metric can have one workload
workload = relationship("Workload", back_populates="metric_definitions")
# Metric can have multiple metric values
metric_values = relationship("MetricValue", back_populates="metric")
class RooflineData(Base):
__tablename__ = f"{PREFIX}roofline_data"
roofline_uuid = Column(Integer, primary_key=True)
workload_id = Column(
Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False
kernel_uuid = Column(
Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False
)
kernel_name = Column(String)
total_flops = Column(Float)
l1_cache_data = Column(Float)
l2_cache_data = Column(Float)
hbm_cache_data = Column(Float)
# Roofline data point can have one workload
workload = relationship("Workload", back_populates="roofline_data_points")
# Roofline data point can have one kernel
kernel = relationship("Kernel", back_populates="roofline_data_points")
class Dispatch(Base):
@@ -135,42 +132,50 @@ class Kernel(Base):
workload = relationship("Workload", back_populates="kernels")
# Kernel can have multiple dispatches
dispatches = relationship("Dispatch", back_populates="kernel")
# Kernel can have multiple metrics
metrics = relationship("Metric", back_populates="kernel")
# Kernel can have multiple metric values
metric_values = relationship("MetricValue", back_populates="kernel")
# Kernel can have multiple roofline data points
roofline_data_points = relationship("RooflineData", back_populates="kernel")
# Kernel can have multiple pc_sampling values
pc_sampling_values = relationship("PCsampling", back_populates="kernel")
class PCsampling(Base):
__tablename__ = f"{PREFIX}pcsampling"
pc_sampling_uuid = Column(Integer, primary_key=True)
workload_id = Column(
Integer, ForeignKey(f"{PREFIX}workload.workload_id"), nullable=False
kernel_uuid = Column(
Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False
)
source = Column(String)
instruction = Column(String)
count = Column(Integer)
kernel_name = Column(String)
offset = Column(Integer)
count_issue = Column(Integer)
count_stall = Column(Integer)
stall_reason = Column(JSON)
# PCsampling can have one workload
workload = relationship("Workload", back_populates="pc_sampling_values")
# PCsampling can have one kernel
kernel = relationship("Kernel", back_populates="pc_sampling_values")
class Value(Base):
__tablename__ = f"{PREFIX}value"
class MetricValue(Base):
__tablename__ = f"{PREFIX}metric_value"
value_uuid = Column(Integer, primary_key=True)
metric_uuid = Column(
Integer, ForeignKey(f"{PREFIX}metric.metric_uuid"), nullable=False
Integer, ForeignKey(f"{PREFIX}metric_definition.metric_uuid"), nullable=False
)
kernel_uuid = Column(
Integer, ForeignKey(f"{PREFIX}kernel.kernel_uuid"), nullable=False
)
value_name = Column(String) # e.g. min, max, avg
value = Column(Float) # e.g. 123.45
# Value can have one metric
metric = relationship("Metric", back_populates="values")
metric = relationship("MetricDefinition", back_populates="metric_values")
# Value can have one kernel
kernel = relationship("Kernel", back_populates="metric_values")
class Metadata(Base):
@@ -250,11 +255,20 @@ def get_views() -> list[TextClause]:
views: dict[str, Select[Any]] = {
"kernel_view": select(
Kernel.kernel_uuid.label("kernel_uuid"),
Kernel.workload_id.label("workload_id"),
Workload.name.label("workload_name"),
Kernel.kernel_name,
func.count(Dispatch.dispatch_id).label("dispatch_count"),
func.sum(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
"duration_ns_sum"
),
func.min(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
"duration_ns_min"
),
func.max(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
"duration_ns_max"
),
median_calc.c.duration_ns_median,
func.avg(Dispatch.end_timestamp - Dispatch.start_timestamp).label(
"duration_ns_mean"
@@ -262,24 +276,31 @@ def get_views() -> list[TextClause]:
)
.select_from(Dispatch)
.join(Kernel, Dispatch.kernel_uuid == Kernel.kernel_uuid)
.join(Workload, Kernel.workload_id == Workload.workload_id)
.join(median_calc.subquery(), Kernel.kernel_name == median_calc.c.kernel_name)
.group_by(Kernel.kernel_name),
.group_by(
Kernel.kernel_uuid, Kernel.workload_id, Workload.name, Kernel.kernel_name
),
"metric_view": select(
Workload.workload_id.label("workload_id"),
Workload.name.label("workload_name"),
Kernel.kernel_uuid.label("kernel_uuid"),
Kernel.kernel_name,
Metric.name.label("metric_name"),
Metric.metric_id,
Metric.description,
Metric.table_name,
Metric.sub_table_name,
Metric.unit,
Value.value_name,
Value.value,
MetricDefinition.metric_uuid.label("metric_uuid"),
MetricDefinition.name.label("metric_name"),
MetricDefinition.metric_id,
MetricDefinition.description,
MetricDefinition.table_name,
MetricDefinition.sub_table_name,
MetricDefinition.unit,
MetricValue.value_uuid.label("value_uuid"),
MetricValue.value_name,
MetricValue.value,
)
.select_from(Metric)
.join(Kernel, Metric.kernel_uuid == Kernel.kernel_uuid)
.join(Value, Metric.metric_uuid == Value.metric_uuid)
.join(Workload, Kernel.workload_id == Workload.workload_id),
.select_from(MetricDefinition)
.join(Workload, MetricDefinition.workload_id == Workload.workload_id)
.join(MetricValue, MetricDefinition.metric_uuid == MetricValue.metric_uuid)
.join(Kernel, MetricValue.kernel_uuid == Kernel.kernel_uuid),
}
return [
@@ -61,7 +61,7 @@ def multi_bar_chart(
def create_instruction_mix_bar_chart(display_df: pd.DataFrame, df_unit: str) -> px.bar:
display_df = display_df.copy()
display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "" else 0)
display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "N/A" else 0)
return px.bar(
display_df,
@@ -78,7 +78,7 @@ def create_multi_bar_charts(
display_df: pd.DataFrame, table_id: int, df_unit: str
) -> list[px.bar]:
display_df = display_df.copy()
display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "" else 0)
display_df["Avg"] = display_df["Avg"].apply(lambda x: int(x) if x != "N/A" else 0)
nested_bar = multi_bar_chart(table_id, display_df)
charts = []
@@ -103,7 +103,9 @@ def create_multi_bar_charts(
def create_sol_charts(display_df: pd.DataFrame, table_id: int) -> list[px.bar]:
display_df = display_df.copy()
display_df["Avg"] = display_df["Avg"].apply(lambda x: float(x) if x != "" else 0.0)
display_df["Avg"] = display_df["Avg"].apply(
lambda x: float(x) if x != "N/A" else 0.0
)
charts = []
@@ -144,7 +146,7 @@ def create_sol_charts(display_df: pd.DataFrame, table_id: int) -> list[px.bar]:
elif table_id == 1101:
# Special formatting reference 'Pct of Peak' value
display_df["Pct of Peak"] = display_df["Pct of Peak"].apply(
lambda x: float(x) if x != "" else 0.0
lambda x: float(x) if x != "N/A" else 0.0
)
charts.append(
px.bar(
@@ -1290,6 +1290,8 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat
# NB: support ignoring the 1st n dispatched execution by '> n'
# The better way may be parsing python slice string
for dispatch_id in workload.filter_dispatch_ids:
if isinstance(dispatch_id, str) and ">" in dispatch_id:
dispatch_id = re.match(r"\>\s*(\d+)", dispatch_id).group(1)
if int(dispatch_id) >= len(df): # subtract 2 bc of the two header rows
console_error("analysis", f"{dispatch_id} is an invalid dispatch id.")
@@ -1297,7 +1299,7 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat
isinstance(workload.filter_dispatch_ids[0], str)
and ">" in workload.filter_dispatch_ids[0]
):
dispatch_match = re.match(r"\> (\d+)", workload.filter_dispatch_ids[0])
dispatch_match = re.match(r"\>\s*(\d+)", workload.filter_dispatch_ids[0])
df = df[
df[schema.PMC_PERF_FILE_PREFIX]["Dispatch_ID"]
> int(dispatch_match.group(1))
@@ -174,15 +174,15 @@ def generate_machine_specs(
##########################################
machine_info = extract_machine_info()
# FIXME: use device
# Load amd-smi data
gpu_info = extract_gpu_info()
##########################################
## B. SoC Specs
##########################################
soc_info = extract_soc_info()
# FIXME: use device
# Load amd-smi data
gpu_info = extract_gpu_info(gpu_arch=soc_info["gpu_arch"])
# Combine all specifications
with amdsmi_ctx():
specs = MachineSpecs(
@@ -269,7 +269,16 @@ def extract_machine_info() -> dict[str, Any]:
@demarcate
def extract_gpu_info() -> dict[str, Any]:
def extract_gpu_info(gpu_arch: Optional[str]) -> dict[str, Any]:
# Partition is only supported on >= MI 300 series
# (gpu_arch should be gfx940 or higher for MI300+)
is_partition_supported = False
if gpu_arch and gpu_arch.startswith("gfx") and len(gpu_arch) >= 6:
try:
is_partition_supported = int(gpu_arch[3:6], 16) >= 0x940
except ValueError:
pass # Invalid hex string, keep is_partition_supported as False
result: dict[str, Optional[str]] = {
"vbios": None,
"compute_partition": None,
@@ -278,17 +287,22 @@ def extract_gpu_info() -> dict[str, Any]:
with amdsmi_ctx():
result["vbios"] = get_gpu_vbios_part_number()
result["compute_partition"] = get_gpu_compute_partition()
result["memory_partition"] = get_gpu_memory_partition()
if is_partition_supported:
result["compute_partition"] = get_gpu_compute_partition()
result["memory_partition"] = get_gpu_memory_partition()
else:
result["compute_partition"] = "N/A"
result["memory_partition"] = "N/A"
# Apply defaults and warnings
if result["compute_partition"] == "N/A" or not result["compute_partition"]:
console_warning("Cannot detect accelerator partition from amd-smi.")
console_warning("Applying default accelerator partition: SPX")
result["compute_partition"] = "SPX"
if is_partition_supported:
if result["compute_partition"] == "N/A" or not result["compute_partition"]:
console_warning("Cannot detect accelerator partition from amd-smi.")
console_warning("Applying default accelerator partition: SPX")
result["compute_partition"] = "SPX"
if result["memory_partition"] == "N/A" or not result["memory_partition"]:
console_warning("Cannot detect memory partition from amd-smi.")
if result["memory_partition"] == "N/A" or not result["memory_partition"]:
console_warning("Cannot detect memory partition from amd-smi.")
console_debug(
f"vbios is {result['vbios']}, compute partition is "
@@ -235,6 +235,101 @@ def detect_rocprof(args: argparse.Namespace) -> str:
return rocprof_cmd
def perform_attach_detach(new_env: dict[str, str], options: dict[str, Any]) -> None:
@contextmanager
def temporary_env(env_vars: dict[str, str]) -> Generator[None, None, None]:
"""
Temporarily change the environment variable of this application.
"""
original_env = os.environ.copy()
os.environ.update({k: str(v) for k, v in env_vars.items()})
try:
yield
finally:
os.environ.clear()
os.environ.update(original_env)
with temporary_env(new_env):
libname = options["ROCPROF_ATTACH_LIBRARY"]
try:
c_lib = ctypes.CDLL(libname)
if c_lib is None:
console_error(f"Error opening {libname}")
except Exception as e:
console_error(f"Error loading {libname}: {e}")
# Set argument and return types for attach/detach functions
try:
# old attach/detach API
c_lib.attach.argtypes = [ctypes.c_uint]
except Exception as e:
console_debug(
"Error setting old attach/detach API argument "
f"types: {e}, trying new API"
)
try:
# new attach/detach API
c_lib.rocattach_attach.restype = ctypes.c_int
c_lib.rocattach_attach.argtypes = [ctypes.c_int]
c_lib.rocattach_detach.restype = ctypes.c_int
c_lib.rocattach_detach.argtypes = [ctypes.c_int]
except Exception as e:
console_error(
f"Error setting attach/detach function argument types: {e}"
)
pid = options["ROCPROF_ATTACH_PID"]
if pid is None:
console_error("Mode of attach/detach must have setup for process ID")
try:
# old attach/detach API
c_lib.attach(int(pid))
except Exception as e:
console_debug(f"Error attaching with old API: {e}, trying new API")
try:
# new attach/detach API
attach_status = c_lib.rocattach_attach(int(pid))
if attach_status != 0:
console_error(
f"Error attaching to process {pid}, "
f"rocattach_attach returned {attach_status}"
)
except Exception as e:
console_error(f"Error attaching to process {pid}: {e}")
duration = os.environ.get("ROCPROF_ATTACH_DURATION", None)
if duration is None:
console_log(
f"\033[93mAttach to process with ID {pid} is successful, "
"Press Enter to detach...\033[0m"
)
input()
else:
console_log(
f"\033[93mAttach to process with ID {pid} is successful, "
f"detach will happen in {duration} milliseconds...\033[0m"
)
time.sleep(int(duration) / 1000)
try:
# old attach/detach API
c_lib.detach(int(pid))
except Exception as e:
console_debug(f"Error detaching with old API: {e}, trying new API")
try:
# new attach/detach API
detach_status = c_lib.rocattach_detach(int(pid))
if detach_status != 0:
console_error(
f"Error detaching from process {pid}, "
f"rocattach_detach returned {detach_status}"
)
except Exception as e:
console_error(f"Error detaching from process {pid}: {e}")
def capture_subprocess_output(
subprocess_args: list[str],
new_env: Optional[dict[str, str]] = None,
@@ -788,49 +883,7 @@ def run_prof(
console_debug(f"rocprof sdk env vars: {new_env}")
if is_mode_live_attach:
@contextmanager
def temporary_env(env_vars: dict[str, str]) -> Generator[None, None, None]:
"""
Temporarily change the environment variable of this application.
"""
original_env = os.environ.copy()
os.environ.update({k: str(v) for k, v in env_vars.items()})
try:
yield
finally:
os.environ.clear()
os.environ.update(original_env)
with temporary_env(new_env):
libname = options["ROCPROF_ATTACH_LIBRARY"]
c_lib = ctypes.CDLL(libname)
if c_lib is None:
console_error(f"Error opening {libname}")
c_lib.attach.argtypes = [ctypes.c_uint]
pid = options["ROCPROF_ATTACH_PID"]
if pid is None:
console_error(
"Mode of attach/detach must have setup for process ID"
)
c_lib.attach(int(pid))
duration = os.environ.get("ROCPROF_ATTACH_DURATION", None)
if duration is None:
console_log(
f"\033[93mAttach to process with ID {pid} is successful, "
"Press Enter to detach...\033[0m"
)
input()
else:
console_log(
f"\033[93mAttach to process with ID {pid} is successful, "
f"detach will happen in {duration} milliseconds...\033[0m"
)
time.sleep(int(duration) / 1000)
c_lib.detach(int(pid))
perform_attach_detach(new_env, options)
else:
if app_cmd is None:
console_error(
@@ -29,6 +29,7 @@ import re
import sqlite3
import subprocess
import sys
import time
from pathlib import Path
import numpy as np
@@ -989,19 +990,19 @@ def test_analyze_rocpd(
Dispatch,
Kernel,
Metadata,
Metric,
MetricDefinition,
MetricValue,
RooflineData,
Value,
Workload,
)
table_name_map = {
"compute_workload": Workload,
"compute_metric": Metric,
"compute_metric_definition": MetricDefinition,
"compute_roofline_data": RooflineData,
"compute_dispatch": Dispatch,
"compute_kernel": Kernel,
"compute_value": Value,
"compute_metric_value": MetricValue,
"compute_metadata": Metadata,
}
@@ -2268,6 +2269,7 @@ def test_live_attach_detach_block(binary_handler_profile_rocprof_compute):
try:
# Start workload
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
time.sleep(5) # Give workload time to start
attach_detach = {
"attach_pid": process_workload.pid,
@@ -2316,8 +2318,9 @@ def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_co
try:
# Start workload with sleep mode enabled
process_workload = subprocess.Popen(
[config["app_hip_dynamic_shared"], "--enable-sleep"], env=env
[*config["app_hip_dynamic_shared"], "--enable-sleep"], env=env
)
time.sleep(5) # Give workload time to start
attach_detach = {
"attach_pid": process_workload.pid,
@@ -2358,7 +2361,7 @@ def test_live_attach_detach_block_thread_sleep(binary_handler_profile_rocprof_co
@pytest.mark.live_attach_detach
def test_live_attach_detach_singlepath_launch_stats(
def test_live_attach_detach_singlepass_launch_stats(
binary_handler_profile_rocprof_compute,
):
options = ["--set", "launch_stats"]
@@ -2374,6 +2377,7 @@ def test_live_attach_detach_singlepath_launch_stats(
try:
# Start workload
process_workload = subprocess.Popen(config["app_hip_dynamic_shared"], env=env)
time.sleep(5) # Give workload time to start
attach_detach = {
"attach_pid": process_workload.pid,
+14 -1
Bestand weergeven
@@ -242,6 +242,19 @@ Full documentation for ROCprofiler-SDK is available at [rocm.docs.amd.com/projec
## ROCprofiler-SDK 1.1.0 for ROCm release 7.2
### Added
- Strix halo support for counter collection.
- Counter collection support for `gfx1150` and `gfx1151` (Strix Halo).
- HSA Extension API v8 support.
- `hipStreamCopyAttributes` API implementation.
### Optimized
- Improved process attachment and updated the corresponding [documentation](https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/how-to/using-rocprofv3-process-attachment.html).
- Improved [Quick reference guide for rocprofv3] (https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/quick_guide.html).
- Updated installation documentation with links to the latest repository (https://rocm.docs.amd.com/projects/rocprofiler-sdk/en/latest/install/installation.html).
### Resolved issues
- Fixed multi-GPU dimension mismatch.
- Fixed device lock issue for dispatch counters.
- Addressed OpenMP Tools task scheduling null pointer exception.
- Fixed stream ID errors arising during process attachment.
- Fixed issues arising during dynamic code object loading.
@@ -1004,6 +1004,11 @@ ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipGetProcAddress_spt)
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 20
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo)
#endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging)
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging)
ROCPROFILER_ENUM_LABEL(ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams)
#endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION == 0
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 442);
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 1
@@ -1046,6 +1051,8 @@ static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 506);
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 507);
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 508);
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
static_assert(ROCPROFILER_HIP_RUNTIME_API_ID_LAST == 511);
#else
# if !defined(ROCPROFILER_UNSAFE_NO_VERSION_CHECK) && \
(defined(ROCPROFILER_CI) && ROCPROFILER_CI > 0)
@@ -3377,6 +3377,26 @@ typedef union rocprofiler_hip_api_args_t
size_t* paramSize;
} hipKernelGetParamInfo;
#endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
struct
{
// Empty struct has a size of 0 in C but size of 1 in C++.
// Add the rocprofiler_hip_api_no_args struct to fix this
rocprofiler_hip_api_no_args no_args;
} hipExtDisableLogging;
struct
{
// Empty struct has a size of 0 in C but size of 1 in C++.
// Add the rocprofiler_hip_api_no_args struct to fix this
rocprofiler_hip_api_no_args no_args;
} hipExtEnableLogging;
struct
{
size_t log_level;
size_t log_size;
size_t log_mask;
} hipExtSetLoggingParams;
#endif
} rocprofiler_hip_api_args_t;
ROCPROFILER_EXTERN_C_FINI
@@ -575,6 +575,11 @@ typedef enum rocprofiler_hip_runtime_api_id_t // NOLINT(performance-enum-size)
#endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 20
ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo,
#endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging,
ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging,
ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams,
#endif
ROCPROFILER_HIP_RUNTIME_API_ID_LAST,
} rocprofiler_hip_runtime_api_id_t;
@@ -623,6 +623,12 @@ ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipGetProcAddress_spt_fn, 506);
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipKernelGetParamInfo_fn, 507);
#endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtDisableLogging_fn, 508);
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtEnableLogging_fn, 509);
ROCP_SDK_ENFORCE_ABI(::HipDispatchTable, hipExtSetLoggingParams_fn, 510);
#endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION == 0
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 442)
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 1
@@ -665,6 +671,8 @@ ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 506)
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 507)
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 20
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 508)
#elif HIP_RUNTIME_API_TABLE_STEP_VERSION == 21
ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 511)
#else
INTERNAL_CI_ROCP_SDK_ENFORCE_ABI_VERSIONING(::HipDispatchTable, 0)
#endif
@@ -650,6 +650,11 @@ HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNT
HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipKernelGetParamInfo, hipKernelGetParamInfo, hipKernelGetParamInfo_fn, kernel, paramIndex, paramOffset, paramSize);
#endif
#if HIP_RUNTIME_API_TABLE_STEP_VERSION >= 21
HIP_API_INFO_DEFINITION_0(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtDisableLogging, hipExtDisableLogging, hipExtDisableLogging_fn);
HIP_API_INFO_DEFINITION_0(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtEnableLogging, hipExtEnableLogging, hipExtEnableLogging_fn);
HIP_API_INFO_DEFINITION_V(ROCPROFILER_HIP_TABLE_ID_Runtime, ROCPROFILER_HIP_RUNTIME_API_ID_hipExtSetLoggingParams, hipExtSetLoggingParams, hipExtSetLoggingParams_fn, log_level, log_size, log_mask);
#endif
// clang-format on
#else
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
#
# function for
#
@@ -142,11 +145,27 @@ function(rocprofiler_systems_causal_example_executable _NAME)
endif()
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(
TARGETS ${_NAME} ${_NAME}-rocprofsys ${_NAME}-coz
DESTINATION bin
COMPONENT rocprofiler-systems-examples
OPTIONAL
set(_TARGETS
${_NAME}
${_NAME}-rocprofsys
${_NAME}-ndebug
${_NAME}-rocprofsys-ndebug
${_NAME}-coz
)
set(_EXISTING_TARGETS)
foreach(_TARGET IN LISTS _TARGETS)
if(TARGET ${_TARGET})
list(APPEND _EXISTING_TARGETS ${_TARGET})
endif()
endforeach()
if(_EXISTING_TARGETS)
install(
TARGETS ${_EXISTING_TARGETS}
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endif()
endfunction()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-code-coverage-example LANGUAGES CXX)
@@ -22,7 +25,11 @@ target_link_libraries(code-coverage PRIVATE Threads::Threads)
target_compile_options(code-coverage PRIVATE ${_FLAGS})
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(TARGETS code-coverage DESTINATION bin COMPONENT rocprofiler-systems-examples)
install(
TARGETS code-coverage
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
set(PYTHON_FILES code-coverage.py)
@@ -42,7 +49,7 @@ if(Python3_FOUND)
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(
PROGRAMS ${PROJECT_BINARY_DIR}/${_FILE}
DESTINATION bin
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-fork LANGUAGES CXX)
@@ -18,7 +21,13 @@ target_link_libraries(
target_compile_options(fork-example PRIVATE ${_FLAGS})
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(TARGETS fork-example DESTINATION bin COMPONENT rocprofiler-systems-examples)
if(TARGET fork-example)
install(
TARGETS fork-example
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endif()
# HIP fork example (multi-process concurrency test)
@@ -81,10 +90,10 @@ if(HIPCC_EXECUTABLE)
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET hipMallocConcurrencyMproc)
endif()
if(ROCPROFSYS_INSTALL_EXAMPLES)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET hipMallocConcurrencyMproc)
install(
TARGETS hipMallocConcurrencyMproc
DESTINATION bin
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -1,25 +1,5 @@
################################################################################
# Copyright (c) 2024 - 2025 Advanced Micro Devices, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
################################################################################
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
@@ -157,11 +137,15 @@ if(HIP_FOUND AND rocjpeg_FOUND AND Threads_FOUND AND rocprofiler-register_FOUND)
target_compile_options(jpegdecode PRIVATE ${_FLAGS})
copy_image_files_and_make_copies()
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(TARGETS jpegdecode DESTINATION bin COMPONENT rocprofiler-systems-examples)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET jpegdecode)
install(
FILES ${CMAKE_BINARY_DIR}/images
DESTINATION share/rocprofiler-systems/tests/images
TARGETS jpegdecode
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
install(
DIRECTORY ${CMAKE_BINARY_DIR}/images/
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/images
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-lulesh-example LANGUAGES C CXX)
@@ -82,16 +85,11 @@ rocprofiler_systems_causal_example_executable(
INCLUDE_DIRECTORIES ${PROJECT_SOURCE_DIR}/includes
)
if(ROCPROFSYS_INSTALL_EXAMPLES)
if(LULESH_BUILD_KOKKOS)
install(
TARGETS kokkoscore kokkoscontainers
DESTINATION ${CMAKE_INSTALL_LIBDIR}
if(ROCPROFSYS_INSTALL_EXAMPLES AND LULESH_BUILD_KOKKOS)
install(
TARGETS kokkoscore kokkoscontainers kokkossimd
LIBRARY
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/lib
COMPONENT rocprofiler-systems-examples
)
set_target_properties(
lulesh
PROPERTIES INSTALL_RPATH "\$ORIGIN/../${CMAKE_INSTALL_LIBDIR}"
)
endif()
)
endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-mpi-examples LANGUAGES C CXX)
@@ -77,16 +80,24 @@ add_executable(mpi-example mpi.cpp)
target_link_libraries(mpi-example PRIVATE mpi-cxx-interface-library)
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(
TARGETS
mpi-example
mpi-allgather
mpi-bcast
mpi-all2all
mpi-reduce
mpi-scatter-gather
mpi-send-recv
DESTINATION bin
COMPONENT rocprofiler-systems-examples
set(MPI_EXAMPLES
mpi-example
mpi-allgather
mpi-bcast
mpi-all2all
mpi-reduce
mpi-scatter-gather
mpi-send-recv
mpi-allreduce
)
foreach(MPI_EXAMPLE IN LISTS MPI_EXAMPLES)
if(TARGET ${MPI_EXAMPLE})
install(
TARGETS ${MPI_EXAMPLE}
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endforeach()
endif()
@@ -89,13 +89,17 @@ endif()
target_link_libraries(openmp-cg PRIVATE openmp-common)
target_link_libraries(openmp-lu PRIVATE openmp-common)
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(
TARGETS openmp-cg openmp-lu
DESTINATION bin
COMPONENT rocprofiler-systems-examples
)
endif()
set(OPENMP_EXAMPLES openmp-cg openmp-lu)
foreach(OPENMP_EXAMPLE IN LISTS OPENMP_EXAMPLES)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET ${OPENMP_EXAMPLE})
install(
TARGETS ${OPENMP_EXAMPLE}
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endforeach()
set(DEFAULT_GPU_TARGETS
"gfx900"
@@ -280,3 +280,19 @@ set(ROCPROFSYS_OMPVV_OFFLOAD_TESTS
rocprofiler_systems_message(STATUS
"Successfully configured OMPVV"
)
if(ROCPROFSYS_INSTALL_EXAMPLES)
foreach(
test_target
IN
LISTS ROCPROFSYS_OMPVV_HOST_TESTS ROCPROFSYS_OMPVV_OFFLOAD_TESTS
)
if(TARGET "${test_target}-build")
install(
PROGRAMS "${OMPVV_BIN_DEST}/${test_target}"
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endforeach()
endif()
@@ -71,7 +71,7 @@ endif()
# Use the directory that actually contains the library we found
get_filename_component(_rocm_llvm_lib "${LIBOMPTARGET_SO}" DIRECTORY)
set(_rocm_clang_lib "${ROCM_ROOT_DIR}/lib")
set(_COMMON_RPATH "${_rocm_llvm_lib};${_rocm_clang_lib};$ORIGIN")
set(_COMMON_RPATH "${_rocm_llvm_lib};${_rocm_clang_lib};$ORIGIN;$ORIGIN/lib")
if(ROCmVersion_DIR)
list(APPEND _COMMON_RPATH "${ROCmVersion_DIR}/llvm/lib")
endif()
@@ -126,3 +126,15 @@ rocprofiler_systems_custom_compilation(
rocprofiler_systems_custom_compilation(TARGET openmp-target
COMPILER ${OMP_TARGET_COMPILER}
)
if(ROCPROFSYS_INSTALL_EXAMPLES)
if(TARGET openmp-target AND TARGET openmp-target-lib)
install(
TARGETS openmp-target openmp-target-lib
RUNTIME DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
LIBRARY
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/lib
COMPONENT rocprofiler-systems-examples
)
endif()
endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-parallel-overhead-example LANGUAGES CXX)
@@ -32,10 +35,13 @@ target_link_libraries(
)
target_compile_definitions(parallel-overhead-locks PRIVATE USE_LOCKS=1)
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(
TARGETS parallel-overhead parallel-overhead-locks
DESTINATION bin
COMPONENT rocprofiler-systems-examples
)
endif()
set(PARALLEL_OVERHEAD_EXAMPLES parallel-overhead parallel-overhead-locks)
foreach(PARALLEL_OVERHEAD_EXAMPLE IN LISTS PARALLEL_OVERHEAD_EXAMPLES)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET ${PARALLEL_OVERHEAD_EXAMPLE})
install(
TARGETS ${PARALLEL_OVERHEAD_EXAMPLE}
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endforeach()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-python)
@@ -36,7 +39,7 @@ if(Python3_FOUND)
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(
PROGRAMS ${PROJECT_BINARY_DIR}/${_FILE}
DESTINATION bin
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -138,6 +138,14 @@ if(hip_FOUND AND rccl_FOUND)
add_dependencies(rccl-tests::${_EXE_NAME} copy-${_EXE_NAME})
list(APPEND _RCCL_TEST_TARGETS "rccl-tests::${_EXE_NAME}")
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(
PROGRAMS ${_EXE_DEST_PATH}
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
endforeach()
set(RCCL_TEST_TARGETS "${_RCCL_TEST_TARGETS}" CACHE INTERNAL "rccl-test targets")
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-rewrite-caller-example LANGUAGES CXX)
@@ -18,6 +21,10 @@ set(CMAKE_BUILD_TYPE "Debug")
add_executable(rewrite-caller rewrite-caller.cpp)
target_compile_options(rewrite-caller PRIVATE ${_FLAGS})
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(TARGETS rewrite-caller DESTINATION bin COMPONENT rocprofiler-systems-examples)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET rewrite-caller)
install(
TARGETS rewrite-caller
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -1,24 +1,5 @@
# MIT License
#
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
@@ -113,6 +94,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET roctx)
endif()
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(TARGETS roctx DESTINATION bin COMPONENT rocprofiler-systems-examples)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET roctx)
install(
TARGETS roctx
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -40,3 +40,11 @@ target_compile_options(tests-compile-options INTERFACE -g)
add_executable(thread-limit thread-limit.cpp)
target_compile_definitions(thread-limit PRIVATE MAX_THREADS=${ROCPROFSYS_MAX_THREADS})
target_link_libraries(thread-limit PRIVATE Threads::Threads tests-compile-options)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET thread-limit)
install(
TARGETS thread-limit
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-trace-time-window-example LANGUAGES CXX)
@@ -18,10 +21,10 @@ set(CMAKE_BUILD_TYPE "Debug")
add_executable(trace-time-window trace-time-window.cpp)
target_compile_options(trace-time-window PRIVATE ${_FLAGS})
if(ROCPROFSYS_INSTALL_EXAMPLES)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET trace-time-window)
install(
TARGETS trace-time-window
DESTINATION bin
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-transferBench-example LANGUAGES CXX)
@@ -120,6 +123,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transferBench)
endif()
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(TARGETS transferBench DESTINATION bin COMPONENT rocprofiler-systems-examples)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET transferBench)
install(
TARGETS transferBench
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-transpose-example LANGUAGES CXX)
@@ -93,6 +96,10 @@ if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET transpose)
endif()
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(TARGETS transpose DESTINATION bin COMPONENT rocprofiler-systems-examples)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET transpose)
install(
TARGETS transpose
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-user-api-example LANGUAGES CXX)
@@ -24,6 +27,10 @@ target_link_libraries(
PRIVATE Threads::Threads rocprofiler-systems::rocprofiler-systems-user-library
)
if(ROCPROFSYS_INSTALL_EXAMPLES)
install(TARGETS user-api DESTINATION bin COMPONENT rocprofiler-systems-examples)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET user-api)
install(
TARGETS user-api
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -1,3 +1,6 @@
# Copyright (c) Advanced Micro Devices, Inc.
# SPDX-License-Identifier: MIT
cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
project(rocprofiler-systems-videodecode-example LANGUAGES CXX)
@@ -176,15 +179,15 @@ if(FFMPEG_FOUND AND rocdecode_FOUND)
target_compile_definitions(videodecode PUBLIC USE_AVCODEC_GREATER_THAN_58_134=1)
endif()
if(ROCPROFSYS_INSTALL_EXAMPLES)
if(ROCPROFSYS_INSTALL_EXAMPLES AND TARGET videodecode)
install(
TARGETS videodecode
DESTINATION bin
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples
COMPONENT rocprofiler-systems-examples
)
install(
FILES ${CMAKE_BINARY_DIR}/videos
DESTINATION share/rocprofiler-systems/tests/videos
DIRECTORY ${CMAKE_BINARY_DIR}/videos/
DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/rocprofiler-systems/examples/videos
COMPONENT rocprofiler-systems-examples
)
endif()
@@ -192,6 +192,10 @@ prepare_environment_for_run(parser_data_t& _data)
rocprofsys::argparse::add_ld_preload(_data);
rocprofsys::argparse::add_ld_library_path(_data);
}
rocprofsys::argparse::add_torch_library_path(_data, _data.verbose > 0);
rocprofsys::common::consolidate_env_entries(_data.current);
}
void
@@ -933,3 +933,9 @@ parse_args(int argc, char** argv, std::vector<char*>& _env)
return _outv;
}
void
add_torch_library_path(std::vector<char*>& envp, const std::vector<char*>& argv)
{
rocprofsys::common::add_torch_library_path(envp, argv, verbose > 0, updated_envs);
}
@@ -51,6 +51,8 @@ main(int argc, char** argv)
_argv.emplace_back(argv[i]);
}
add_torch_library_path(_env, _argv);
print_updated_environment(_env);
if(!_argv.empty())
@@ -35,3 +35,6 @@ get_initial_environment();
std::vector<char*>
parse_args(int argc, char** argv, std::vector<char*>& envp);
void
add_torch_library_path(std::vector<char*>& envp, const std::vector<char*>& argv);
@@ -26,9 +26,12 @@
#include "common/join.hpp"
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
@@ -197,7 +200,7 @@ remove_env(std::vector<char*>& _environ, std::string_view _env_var,
{
if(match(itr))
{
free(itr);
std::free(itr);
itr = nullptr;
}
}
@@ -266,6 +269,113 @@ discover_llvm_libdir_for_ompt(bool verbose = false)
return {};
}
inline bool
is_python_interpreter(std::string_view executable)
{
if(executable.empty()) return false;
const auto slash_pos = executable.rfind('/');
const auto basename = (slash_pos != std::string_view::npos)
? executable.substr(slash_pos + 1)
: executable;
if(basename == "python" || basename == "python3") return true;
constexpr std::string_view python3_prefix = "python3.";
const bool has_valid_prefix =
basename.size() > python3_prefix.size() &&
basename.substr(0, python3_prefix.size()) == python3_prefix;
if(!has_valid_prefix) return false;
const auto version_digits = basename.substr(python3_prefix.size());
return std::all_of(version_digits.begin(), version_digits.end(),
[](unsigned char c) { return std::isdigit(c); });
}
inline std::string
discover_torch_libpath(const std::string& python_binary, bool verbose = false)
{
if(python_binary.empty()) return {};
const auto is_safe_executable_path = [](const std::string& path) {
// Allow only a conservative set of characters in the executable path to
// avoid injection when used in a shell command.
for(unsigned char c : path)
{
if(std::isalnum(c) != 0) continue;
switch(c)
{
case '/':
case '.':
case '_':
case '-':
case '+': break;
default: return false;
}
}
return true;
};
if(!is_safe_executable_path(python_binary))
{
ROCPROFSYS_ENVIRON_LOG(
verbose, "Unsafe characters detected in Python interpreter path: %s\n",
python_binary.c_str());
return {};
}
const auto cmd = "\"" + python_binary +
"\" -c \"import torch; print(torch.__path__[0])\" 2>/dev/null";
FILE* pipe = popen(cmd.c_str(), "r");
if(!pipe)
{
ROCPROFSYS_ENVIRON_LOG(verbose, "Failed to execute command: %s\n", cmd.c_str());
return {};
}
char buffer[1024];
std::string result;
while(fgets(buffer, sizeof(buffer), pipe))
{
result.append(buffer);
// stop if we've read the full line (torch path is printed on a single line)
if(!result.empty() && result.back() == '\n') break;
}
int status = pclose(pipe);
if(status != 0 || result.empty())
{
ROCPROFSYS_ENVIRON_LOG(verbose, "torch not found for Python interpreter: %s\n",
python_binary.c_str());
return {};
}
while(!result.empty() &&
(result.back() == '\n' || result.back() == '\r' || result.back() == ' '))
{
result.pop_back();
}
if(result.empty()) return {};
std::string torch_libdir = result + "/lib";
if(!::tim::filepath::direxists(torch_libdir))
{
ROCPROFSYS_ENVIRON_LOG(verbose, "torch lib directory does not exist: %s\n",
torch_libdir.c_str());
return {};
}
ROCPROFSYS_ENVIRON_LOG(verbose, "Discovered torch library path: %s\n",
torch_libdir.c_str());
return torch_libdir;
}
enum class update_mode : uint8_t
{
REPLACE = 0,
@@ -335,7 +445,7 @@ update_env(std::vector<char*>& _environ, std::string_view _env_var, Tp&& _env_va
}
else
{
free(itr);
std::free(itr);
itr = strdup(join('=', _env_var, _env_val_str).c_str());
}
return;
@@ -343,5 +453,145 @@ update_env(std::vector<char*>& _environ, std::string_view _env_var, Tp&& _env_va
_environ.emplace_back(strdup(join('=', _env_var, _env_val_str).c_str()));
}
template <typename UpdatedEnvsT>
inline void
add_torch_library_path(std::vector<char*>& envp, const std::vector<char*>& argv,
bool verbose, UpdatedEnvsT& updated_envs)
{
if(argv.empty() || argv.front() == nullptr) return;
if(!is_python_interpreter(argv.front())) return;
auto torch_libpath = discover_torch_libpath(argv.front(), verbose);
if(torch_libpath.empty()) return;
std::unordered_set<std::string> seen{ torch_libpath };
std::string result = torch_libpath;
constexpr std::string_view ld_prefix = "LD_LIBRARY_PATH=";
auto is_ld_path = [&](char* entry) {
return entry &&
std::string_view{ entry }.substr(0, ld_prefix.length()) == ld_prefix;
};
for(auto& entry : envp)
{
if(!is_ld_path(entry)) continue;
std::istringstream stream{ std::string{ entry + ld_prefix.length() } };
for(std::string path; std::getline(stream, path, ':');)
{
if(!path.empty() && seen.insert(path).second) result += ":" + path;
}
std::free(entry);
entry = nullptr;
}
envp.erase(std::remove(envp.begin(), envp.end(), nullptr), envp.end());
envp.emplace_back(strdup(join("", ld_prefix, result).c_str()));
updated_envs.emplace(ld_prefix.substr(0, ld_prefix.length() - 1));
}
inline void
consolidate_env_entries(std::vector<char*>& envp)
{
constexpr char delim = ':';
struct key_data
{
std::vector<std::string> parts;
std::unordered_set<std::string> seen;
void add_unique(std::string part)
{
if(!part.empty() && seen.insert(part).second)
parts.emplace_back(std::move(part));
}
};
auto parse_entry = [](std::string_view entry)
-> std::optional<std::pair<std::string_view, std::string_view>> {
auto eq_pos = entry.find('=');
if(eq_pos == std::string_view::npos) return std::nullopt;
return std::make_pair(entry.substr(0, eq_pos), entry.substr(eq_pos + 1));
};
auto join_parts = [delim](std::string_view key,
const std::vector<std::string>& parts) {
std::string result;
const auto total_parts_length = std::accumulate(
parts.begin(), parts.end(), std::size_t{ 0 },
[](std::size_t acc, const std::string& part) { return acc + part.size(); });
const auto delim_count = parts.size() - 1;
const auto equal_sign_length = 1;
result.reserve(key.size() + equal_sign_length + total_parts_length + delim_count);
result.append(key);
result += '=';
result =
std::accumulate(parts.begin(), parts.end(), std::move(result),
[delim, &parts](std::string acc, const std::string& part) {
if(part != parts.front()) acc += delim;
acc.append(part);
return acc;
});
return result;
};
std::unordered_map<std::string_view, key_data> key_map;
std::vector<std::string_view> key_order;
for(auto* entry : envp)
{
if(!entry)
{
continue;
}
auto parsed = parse_entry(entry);
if(!parsed)
{
continue;
}
auto [key, value] = *parsed;
auto [it, inserted] = key_map.try_emplace(key);
if(inserted)
{
key_order.emplace_back(key);
}
auto& data = it->second;
std::istringstream stream{ std::string{ value } };
for(std::string part; std::getline(stream, part, delim);)
{
data.add_unique(part);
}
}
std::vector<char*> result;
result.reserve(key_order.size());
for(auto key : key_order)
{
result.emplace_back(strdup(join_parts(key, key_map[key].parts).c_str()));
}
for(auto* entry : envp)
{
std::free(entry);
entry = nullptr;
}
envp = std::move(result);
}
} // namespace common
} // namespace rocprofsys
@@ -24,6 +24,7 @@ add_library(
lib-common-tests
OBJECT
test_discover_llvm_libdir.cpp
test_environment.cpp
test_path.cpp
test_remove_env.cpp
test_update_env.cpp
@@ -0,0 +1,146 @@
// Copyright (c) Advanced Micro Devices, Inc.
// SPDX-License-Identifier: MIT
#include "common/environment.hpp"
#include <gtest/gtest.h>
using namespace rocprofsys::common;
class IsPythonInterpreterTest : public ::testing::Test
{};
TEST_F(IsPythonInterpreterTest, RecognizesPython)
{
EXPECT_TRUE(is_python_interpreter("python"));
EXPECT_TRUE(is_python_interpreter("python3"));
EXPECT_TRUE(is_python_interpreter("python3.8"));
EXPECT_TRUE(is_python_interpreter("python3.9"));
EXPECT_TRUE(is_python_interpreter("python3.10"));
EXPECT_TRUE(is_python_interpreter("python3.11"));
EXPECT_TRUE(is_python_interpreter("python3.12"));
EXPECT_TRUE(is_python_interpreter("/usr/bin/python"));
EXPECT_TRUE(is_python_interpreter("/usr/bin/python3"));
EXPECT_TRUE(is_python_interpreter("/usr/bin/python3.10"));
EXPECT_TRUE(is_python_interpreter("/home/user/venv/bin/python"));
EXPECT_TRUE(is_python_interpreter("/opt/conda/bin/python3.11"));
EXPECT_FALSE(is_python_interpreter("bash"));
EXPECT_FALSE(is_python_interpreter("sh"));
EXPECT_FALSE(is_python_interpreter("ruby"));
EXPECT_FALSE(is_python_interpreter("node"));
EXPECT_FALSE(is_python_interpreter("java"));
EXPECT_FALSE(is_python_interpreter("/usr/bin/bash"));
EXPECT_FALSE(is_python_interpreter("./my_app"));
EXPECT_FALSE(is_python_interpreter("pythonista"));
EXPECT_FALSE(is_python_interpreter("python_script.py"));
EXPECT_FALSE(is_python_interpreter("mypython"));
EXPECT_FALSE(is_python_interpreter("python2"));
EXPECT_FALSE(is_python_interpreter("python3."));
EXPECT_FALSE(is_python_interpreter("python3.a"));
EXPECT_FALSE(is_python_interpreter("python3.10a"));
EXPECT_FALSE(is_python_interpreter("python3x10"));
EXPECT_FALSE(is_python_interpreter(""));
EXPECT_FALSE(is_python_interpreter("/usr/bin/"));
}
class DuplicatedEnvironmentEntriesTest : public ::testing::Test
{};
TEST_F(DuplicatedEnvironmentEntriesTest, DuplicateEnvironmentEntries)
{
std::vector<char*> env_vars = {
strdup("PATH=/usr/local/bin:/usr/bin:/bin:/usr/local/bin2"),
strdup("PATH=/usr/local/bin:/usr/bin:/bin"),
};
consolidate_env_entries(env_vars);
ASSERT_EQ(env_vars.size(), 1);
EXPECT_STREQ(env_vars[0], "PATH=/usr/local/bin:/usr/bin:/bin:/usr/local/bin2");
for(auto* entry : env_vars)
free(entry);
}
TEST_F(DuplicatedEnvironmentEntriesTest, HandlesEmptyVector)
{
std::vector<char*> env_vars;
consolidate_env_entries(env_vars);
EXPECT_TRUE(env_vars.empty());
}
TEST_F(DuplicatedEnvironmentEntriesTest, HandlesNullEntries)
{
std::vector<char*> env_vars = {
strdup("PATH=/usr/bin"),
nullptr,
strdup("PATH=/bin"),
};
consolidate_env_entries(env_vars);
ASSERT_EQ(env_vars.size(), 1);
EXPECT_STREQ(env_vars[0], "PATH=/usr/bin:/bin");
for(auto* entry : env_vars)
std::free(entry);
}
TEST_F(DuplicatedEnvironmentEntriesTest, HandlesEmptyValues)
{
std::vector<char*> env_vars = {
strdup("EMPTY_VAR="),
strdup("PATH=/usr/bin"),
};
consolidate_env_entries(env_vars);
ASSERT_EQ(env_vars.size(), 2);
for(auto* entry : env_vars)
std::free(entry);
}
class AddTorchLibraryPathTest : public ::testing::Test
{
protected:
std::unordered_set<std::string> updated_envs;
};
TEST_F(AddTorchLibraryPathTest, SkipsNonPythonExecutables)
{
std::vector<char*> envp = {
strdup("LD_LIBRARY_PATH=/usr/lib"),
};
std::vector<char*> argv = {
strdup("/usr/bin/bash"),
};
add_torch_library_path(envp, argv, false, updated_envs);
// Should not modify environment
ASSERT_EQ(envp.size(), 1);
EXPECT_STREQ(envp[0], "LD_LIBRARY_PATH=/usr/lib");
for(auto* entry : envp)
std::free(entry);
for(auto* entry : argv)
std::free(entry);
}
TEST_F(AddTorchLibraryPathTest, HandlesEmptyArgv)
{
std::vector<char*> envp = {
strdup("LD_LIBRARY_PATH=/usr/lib"),
};
std::vector<char*> argv;
add_torch_library_path(envp, argv, false, updated_envs);
ASSERT_EQ(envp.size(), 1);
EXPECT_STREQ(envp[0], "LD_LIBRARY_PATH=/usr/lib");
for(auto* entry : envp)
std::free(entry);
}
TEST_F(AddTorchLibraryPathTest, HandlesNullArgvFront)
{
std::vector<char*> envp = {
strdup("LD_LIBRARY_PATH=/usr/lib"),
};
std::vector<char*> argv = { nullptr };
add_torch_library_path(envp, argv, false, updated_envs);
ASSERT_EQ(envp.size(), 1);
for(auto* entry : envp)
std::free(entry);
}
@@ -168,6 +168,14 @@ add_ld_library_path(parser_data& _data)
return _data;
}
parser_data&
add_torch_library_path(parser_data& _data, bool verbose)
{
rocprofsys::common::add_torch_library_path(_data.current, _data.command, verbose,
_data.updated);
return _data;
}
parser_data&
add_core_arguments(parser_t& _parser, parser_data& _data)
{
@@ -83,6 +83,9 @@ add_ld_preload(parser_data&);
parser_data&
add_ld_library_path(parser_data&);
parser_data&
add_torch_library_path(parser_data&, bool verbose = false);
parser_data&
add_core_arguments(parser_t&, parser_data&);
@@ -316,7 +316,9 @@ configure_settings(bool _init)
"backend", "perfetto");
ROCPROFSYS_CONFIG_SETTING(bool, "ROCPROFSYS_TRACE_LEGACY",
"Use legacy direct mode for perfetto tracing instead of "
"[DEPRECATED] The new default option is to use data from "
"cached buffer. When set to true system will use "
"legacy direct mode for perfetto tracing instead of "
"deferred trace generation. When false (default), uses "
"cached mode with minimal runtime overhead.",
false, "backend", "perfetto");
@@ -1088,6 +1090,9 @@ configure_settings(bool _init)
handle_deprecated_setting("ROCPROFSYS_OUTPUT_FILE", "ROCPROFSYS_PERFETTO_FILE");
handle_deprecated_setting("ROCPROFSYS_USE_PERFETTO", "ROCPROFSYS_TRACE");
handle_deprecated_setting("ROCPROFSYS_USE_TIMEMORY", "ROCPROFSYS_PROFILE");
handle_deprecated_setting("ROCPROFSYS_DEBUG", "ROCPROFSYS_LOG_LEVEL");
handle_deprecated_setting("ROCPROFSYS_VERBOSE", "ROCPROFSYS_LOG_LEVEL");
handle_deprecated_setting("ROCPROFSYS_TRACE_LEGACY", "ROCPROFSYS_TRACE");
scope::get_fields()[scope::flat::value] = _config->get_flat_profile();
scope::get_fields()[scope::timeline::value] = _config->get_timeline_profile();
@@ -31,6 +31,8 @@
} \
} // namespace ::tim::cereal
#include "common/defines.h"
#if !defined(ROCPROFSYS_USE_ROCM)
# define ROCPROFSYS_USE_ROCM 0
#endif
@@ -40,7 +42,6 @@
#include <timemory/manager.hpp>
#include <dlfcn.h>
#include <string>
#include "core/agent_manager.hpp"
@@ -92,17 +93,6 @@ _amdsmi_is_initialized()
return initialized;
}
void
prevent_amdsmi_library_unload()
{
static bool _initialized = false;
if(_initialized) return;
_initialized = true;
dlopen("libamd_smi.so", RTLD_NOW | RTLD_NOLOAD | RTLD_NODELETE);
dlopen("librocm_smi64.so", RTLD_NOW | RTLD_NOLOAD | RTLD_NODELETE);
}
bool
amdsmi_init()
{
@@ -113,8 +103,6 @@ amdsmi_init()
ROCPROFSYS_AMD_SMI_CALL(::amdsmi_init(AMDSMI_INIT_AMD_GPUS));
get_processor_handles();
_amdsmi_is_initialized() = true; // Mark as initialized
prevent_amdsmi_library_unload();
} catch(std::exception& _e)
{
LOG_ERROR("Exception thrown initializing amd-smi: {}", _e.what());
@@ -367,7 +367,6 @@ config_settings(const std::shared_ptr<settings>& _config)
_skip_domains.emplace("kernel_dispatch");
_skip_domains.emplace("page_migration");
_skip_domains.emplace("scratch_memory");
_add_operation_settings(
"MARKER_API", callback_tracing_info[ROCPROFILER_CALLBACK_TRACING_MARKER_CORE_API],
@@ -652,7 +651,7 @@ get_backtrace_operations(rocprofiler_callback_tracing_kind_t kindv)
{
if(callback_operation_option_names.count(kindv) == 0)
{
LOG_CRITICAL("callback_operation_operation_names does not have value for {}",
LOG_CRITICAL("callback_operation_option_names does not have value for {}",
static_cast<int>(kindv));
::rocprofsys::set_state(::rocprofsys::State::Finalized);
std::abort();
@@ -41,7 +41,7 @@ using storage_parser_t =
storage_parser<type_identifier_t, kernel_dispatch_sample, memory_copy_sample,
memory_allocate_sample, region_sample, in_time_sample,
pmc_event_with_sample, amd_smi_sample, cpu_freq_sample,
backtrace_region_sample>;
backtrace_region_sample, scratch_memory_sample>;
using buffer_storage_t = buffer_storage<flush_worker_factory_t, type_identifier_t>;
@@ -540,6 +540,72 @@ perfetto_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds
#endif
}
void
perfetto_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms)
{
#if ROCPROFSYS_USE_ROCM > 0
auto _corr_id = _sms.correlation_id_internal;
auto _stream_id = _sms.stream_handle;
auto _queue_id_handle = _sms.queue_id_handle;
const auto& _t_info = thread_info::get(_sms.thread_id, SystemTID);
const auto _thread_id_sequent = _t_info->index_data->sequent_value;
auto _beg_ts = _sms.start_timestamp;
auto _end_ts = _sms.end_timestamp;
auto _agent_device_id =
m_agent_manager.get_agent_by_handle(_sms.agent_id_handle).device_type_index;
auto _name = std::string{ m_metadata.get_buffer_name_info().at(
static_cast<rocprofiler_buffer_tracing_kind_t>(_sms.kind),
static_cast<rocprofiler_tracing_operation_t>(_sms.operation)) };
// Scratch memory samples from SDK versions prior to 7.0.2 do not include
// allocation_size field, so counter tracks are not needed
# if ROCPROFSYS_ROCM_VERSION >= 70002
using counter_track =
perfetto_counter_track<rocprofiler_buffer_tracing_scratch_memory_record_t>;
if(!counter_track::exists(_agent_device_id))
{
auto _track_desc_alloc_size = JOIN("", "GPU Scratch Memory [", _agent_device_id,
"] Thread ", _thread_id_sequent);
counter_track::emplace(_agent_device_id, _track_desc_alloc_size, "bytes");
}
if(_sms.operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC)
{
TRACE_COUNTER("rocm_scratch_memory", counter_track::at(_agent_device_id, 0),
_beg_ts, _sms.allocation_size);
}
# endif
auto _track_desc_events = [&]() {
return JOIN("", "GPU Scratch Memory Events Thread ", _thread_id_sequent);
};
const auto _track =
tracing::get_perfetto_track(category::rocm_scratch_memory{}, _track_desc_events);
auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) {
if(!m_use_annotations) return;
annotate_perfetto(ctx, { { "begin_ns", _beg_ts },
{ "end_ns", _end_ts },
{ "corr_id", _corr_id },
{ "stream_id", _stream_id },
{ "queue", _queue_id_handle },
{ "allocation_size", _sms.allocation_size },
{ "agent_id", _agent_device_id },
{ "operation", _name },
{ "flags", _sms.flags } });
};
tracing::push_perfetto(category::rocm_scratch_memory{}, _name.c_str(), _track,
_beg_ts, ::perfetto::Flow::ProcessScoped(_corr_id),
add_perfetto_annotations);
tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track, _end_ts);
#endif
}
void
perfetto_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs)
{
@@ -56,6 +56,7 @@ public:
void finalize_processing();
void handle(const kernel_dispatch_sample& sample);
void handle(const scratch_memory_sample& sample);
void handle(const memory_copy_sample& sample);
void handle(const memory_allocate_sample& sample);
void handle(const region_sample& sample);
@@ -66,6 +66,37 @@ get_handle_from_code_object(
# endif
}
#endif
#if ROCPROFSYS_USE_ROCM > 0
using memory_operation = std::string;
using memory_type = std::string;
std::pair<memory_operation, memory_type>
parse_memory_operation_name(std::string_view memory_operation_name)
{
static const std::unordered_map<std::string_view,
std::pair<memory_operation, memory_type>>
parsing_map{
{ "MEMORY_ALLOCATION_NONE", { "NONE", "REAL" } },
{ "MEMORY_ALLOCATION_ALLOCATE", { "ALLOC", "REAL" } },
{ "MEMORY_ALLOCATION_VMEM_ALLOCATE", { "ALLOC", "VIRTUAL" } },
{ "MEMORY_ALLOCATION_FREE", { "FREE", "REAL" } },
{ "MEMORY_ALLOCATION_VMEM_FREE", { "FREE", "VIRTUAL" } },
{ "SCRATCH_MEMORY_NONE", { "NONE", "SCRATCH" } },
{ "SCRATCH_MEMORY_ALLOC", { "ALLOC", "SCRATCH" } },
{ "SCRATCH_MEMORY_FREE", { "FREE", "SCRATCH" } },
{ "SCRATCH_MEMORY_ASYNC_RECLAIM", { "ASYNC_RECLAIM", "SCRATCH" } },
};
auto item = parsing_map.find(memory_operation_name);
if(item == parsing_map.end())
{
LOG_WARNING("Unknown memory operation name: {}", memory_operation_name);
return { "UNKNOWN", "UNKNOWN" };
}
return item->second;
}
#endif
} // namespace
void
@@ -110,6 +141,46 @@ rocpd_processor_t::handle([[maybe_unused]] const kernel_dispatch_sample& _kds)
#endif
}
void
rocpd_processor_t::handle([[maybe_unused]] const scratch_memory_sample& _sms)
{
#if ROCPROFSYS_USE_ROCM > 0
auto& n_info = node_info::get_instance();
auto process = m_metadata->get_process_info();
const auto* _name = m_metadata->get_buffer_name_info().at(
static_cast<rocprofiler_buffer_tracing_kind_t>(_sms.kind),
static_cast<rocprofiler_tracing_operation_t>(_sms.operation));
auto agent_primary_key =
m_agent_manager->get_agent_by_handle(_sms.agent_id_handle).base_id;
auto thread_primary_key =
m_data_processor->map_thread_id_to_primary_key(_sms.thread_id);
auto category_primary_key = m_data_processor->insert_string(
trait::name<category::rocm_scratch_memory>::value);
auto stack_id = _sms.correlation_id_internal;
auto parent_stack_id = _sms.correlation_id_ancestor;
auto correlation_id = 0;
auto address_value = 0;
auto event_primary_key = m_data_processor->insert_event(
category_primary_key, stack_id, parent_stack_id, correlation_id);
auto [memory_operation, memory_type] = parse_memory_operation_name(_name);
auto extdata_json_str = JOIN("", "{\"flags\": ", _sms.flags, "}");
m_data_processor->insert_memory_alloc(
n_info.id, process.pid, thread_primary_key, agent_primary_key,
memory_operation.c_str(), memory_type.c_str(), _sms.start_timestamp,
_sms.end_timestamp, address_value, _sms.allocation_size, _sms.queue_id_handle,
_sms.stream_handle, event_primary_key, extdata_json_str.c_str());
#endif
}
void
rocpd_processor_t::handle([[maybe_unused]] const memory_copy_sample& _mcs)
{
@@ -153,46 +224,6 @@ void
rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
{
#if ROCPROFSYS_USE_ROCM > 0 && (ROCPROFILER_VERSION >= 600)
static auto memtype_to_db =
[](std::string_view memory_type) -> std::pair<std::string, std::string> {
constexpr auto MEMORY_PREFIX = std::string_view{ "MEMORY_ALLOCATION_" };
constexpr auto SCRATCH_PREFIX = std::string_view{ "SCRATCH_MEMORY_" };
constexpr auto VMEM_PREFIX = std::string_view{ "VMEM_" };
constexpr auto ASYNC_PREFIX = std::string_view{ "ASYNC_" };
std::string _type;
std::string _level;
if(memory_type.find(MEMORY_PREFIX) == 0)
{
_type = memory_type.substr(MEMORY_PREFIX.length());
if(_type.find(VMEM_PREFIX) == 0)
{
_type = _type.substr(VMEM_PREFIX.length());
_level = "VIRTUAL";
}
else
{
_level = "REAL";
}
}
else if(memory_type.find(SCRATCH_PREFIX) == 0)
{
_type = memory_type.substr(SCRATCH_PREFIX.length());
_level = "SCRATCH";
if(memory_type.find(ASYNC_PREFIX) == 0)
{
_type = memory_type.substr(ASYNC_PREFIX.length()); // RECLAIM
}
}
if(_type == "ALLOCATE")
{
_type = "ALLOC";
}
return std::make_pair(_type, _level);
};
auto& n_info = node_info::get_instance();
auto process = m_metadata->get_process_info();
auto thread_primary_key =
@@ -210,7 +241,7 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
static_cast<rocprofiler_buffer_tracing_kind_t>(_mas.kind),
static_cast<rocprofiler_tracing_operation_t>(_mas.operation));
auto [type, level] = memtype_to_db(_name);
auto [memory_operation, memory_type] = parse_memory_operation_name(_name);
auto stack_id = _mas.correlation_id_internal;
auto parent_stack_id = _mas.correlation_id_ancestor;
@@ -224,9 +255,10 @@ rocpd_processor_t::handle([[maybe_unused]] const memory_allocate_sample& _mas)
category_primary_key, stack_id, parent_stack_id, correlation_id);
m_data_processor->insert_memory_alloc(
n_info.id, process.pid, thread_primary_key, agent_primary_key, type.c_str(),
level.c_str(), _mas.start_timestamp, _mas.end_timestamp, _mas.address_value,
_mas.allocation_size, queue_id, _mas.stream_handle, event_primary_key);
n_info.id, process.pid, thread_primary_key, agent_primary_key,
memory_operation.c_str(), memory_type.c_str(), _mas.start_timestamp,
_mas.end_timestamp, _mas.address_value, _mas.allocation_size, queue_id,
_mas.stream_handle, event_primary_key);
}
#endif
}
@@ -45,6 +45,7 @@ public:
void finalize_processing();
void handle(const kernel_dispatch_sample& sample);
void handle(const scratch_memory_sample& sample);
void handle(const memory_copy_sample& sample);
void handle(const memory_allocate_sample& sample);
void handle(const region_sample& sample);
@@ -43,6 +43,11 @@ struct processor_t
static_cast<T*>(this)->handle(sample);
}
void handle(const scratch_memory_sample& sample)
{
static_cast<T*>(this)->handle(sample);
}
void handle(const memory_copy_sample& sample)
{
static_cast<T*>(this)->handle(sample);
@@ -84,6 +89,7 @@ protected:
struct processor_view_t
{
using kernel_dispatch_fn_t = void (*)(void*, const kernel_dispatch_sample&) noexcept;
using scratch_memory_fn_t = void (*)(void*, const scratch_memory_sample&) noexcept;
using memory_copy_fn_t = void (*)(void*, const memory_copy_sample&) noexcept;
#if(ROCPROFILER_VERSION >= 600)
using memory_allocate_fn_t = void (*)(void*, const memory_allocate_sample&) noexcept;
@@ -101,6 +107,7 @@ struct processor_view_t
struct vtable_t
{
kernel_dispatch_fn_t handle_kernel_dispatch;
scratch_memory_fn_t handle_scratch_memory;
memory_copy_fn_t handle_memory_copy;
#if(ROCPROFILER_VERSION >= 600)
memory_allocate_fn_t handle_memory_allocate;
@@ -134,6 +141,11 @@ struct processor_view_t
m_vtable->handle_kernel_dispatch(m_object, sample);
}
ROCPROFSYS_INLINE void handle(const scratch_memory_sample& sample) const noexcept
{
m_vtable->handle_scratch_memory(m_object, sample);
}
ROCPROFSYS_INLINE void handle(const memory_copy_sample& sample) const noexcept
{
m_vtable->handle_memory_copy(m_object, sample);
@@ -194,6 +206,9 @@ private:
+[](void* obj, const kernel_dispatch_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
+[](void* obj, const scratch_memory_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
+[](void* obj, const memory_copy_sample& sample) noexcept {
static_cast<T*>(obj)->handle(sample);
},
@@ -275,6 +290,9 @@ struct sample_processor_t
case type_identifier_t::kernel_dispatch:
handle_sample(static_cast<const kernel_dispatch_sample&>(sample));
break;
case type_identifier_t::scratch_memory:
handle_sample(static_cast<const scratch_memory_sample&>(sample));
break;
case type_identifier_t::memory_copy:
handle_sample(static_cast<const memory_copy_sample&>(sample));
break;
@@ -46,6 +46,7 @@ enum class type_identifier_t : uint32_t
amd_smi_sample = 0x0006,
cpu_freq_sample = 0x0007,
backtrace_region_sample = 0x0008,
scratch_memory = 0x0009,
fragmented_space = 0xFFFF
};
@@ -148,6 +149,83 @@ get_size(const kernel_dispatch_sample& item)
item.grid_size_z, static_cast<uint64_t>(item.stream_handle));
}
struct scratch_memory_sample : cacheable_t
{
static constexpr type_identifier_t type_identifier =
type_identifier_t::scratch_memory;
scratch_memory_sample() = default;
scratch_memory_sample(uint64_t _start_timestamp, uint64_t _end_timestamp,
uint64_t _thread_id, uint64_t _agent_id_handle,
uint64_t _queue_id_handle, int32_t _kind, int32_t _operation,
int32_t _flags, uint64_t _allocation_size,
uint64_t _correlation_id_internal,
uint64_t _correlation_id_ancestor, size_t _stream_handle)
: start_timestamp(_start_timestamp)
, end_timestamp(_end_timestamp)
, thread_id(_thread_id)
, agent_id_handle(_agent_id_handle)
, queue_id_handle(_queue_id_handle)
, kind(_kind)
, operation(_operation)
, flags(_flags)
, allocation_size(_allocation_size)
, correlation_id_internal(_correlation_id_internal)
, correlation_id_ancestor(_correlation_id_ancestor)
, stream_handle(_stream_handle)
{}
uint64_t start_timestamp;
uint64_t end_timestamp;
uint64_t thread_id;
uint64_t agent_id_handle;
uint64_t queue_id_handle;
int32_t kind;
int32_t operation;
int32_t flags;
uint64_t allocation_size;
uint64_t correlation_id_internal;
uint64_t correlation_id_ancestor;
size_t stream_handle;
};
template <>
inline void
serialize(uint8_t* buffer, const scratch_memory_sample& item)
{
utility::store_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
item.agent_id_handle, item.queue_id_handle, item.kind,
item.operation, item.flags, item.allocation_size,
item.correlation_id_internal, item.correlation_id_ancestor,
static_cast<uint64_t>(item.stream_handle));
}
template <>
inline scratch_memory_sample
deserialize(uint8_t*& buffer)
{
scratch_memory_sample item;
uint64_t stream_handle;
utility::parse_value(buffer, item.start_timestamp, item.end_timestamp, item.thread_id,
item.agent_id_handle, item.queue_id_handle, item.kind,
item.operation, item.flags, item.allocation_size,
item.correlation_id_internal, item.correlation_id_ancestor,
stream_handle);
item.stream_handle = stream_handle;
return item;
}
template <>
inline size_t
get_size(const scratch_memory_sample& item)
{
return utility::get_size(item.start_timestamp, item.end_timestamp, item.thread_id,
item.agent_id_handle, item.queue_id_handle, item.kind,
item.operation, item.flags, item.allocation_size,
item.correlation_id_internal, item.correlation_id_ancestor,
static_cast<uint64_t>(item.stream_handle));
}
struct memory_copy_sample : cacheable_t
{
static constexpr type_identifier_t type_identifier = type_identifier_t::memory_copy;
@@ -1272,7 +1272,10 @@ shutdown()
try
{
data::shutdown();
if(data::shutdown())
{
ROCPROFSYS_AMD_SMI_CALL(amdsmi_shut_down());
}
} catch(std::runtime_error& _e)
{
LOG_WARNING("Exception thrown when shutting down amd-smi: {}", _e.what());
@@ -24,6 +24,7 @@
#include "core/common.hpp"
#include "core/config.hpp"
#include "core/state.hpp"
#include "core/timemory.hpp"
#include "library/runtime.hpp"
#include <timemory/backends/threading.hpp>
@@ -33,9 +34,7 @@
#include "logger/debug.hpp"
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <unistd.h>
namespace rocprofsys
{
@@ -89,26 +88,6 @@ void
exit_gotcha::operator()(const gotcha_data& _data, exit_func_t _func, int _ec) const
{
_exit_info = { true, _data.tool_id.find("quick") != std::string::npos, _ec };
if(config::get_use_amd_smi())
{
threading::clear_callbacks();
if(get_state() < ::rocprofsys::State::Finalized && !is_child_process())
{
LOG_DEBUG("Finalizing {} before calling {}({})...", get_exe_name(),
_data.tool_id, _ec);
rocprofsys_finalize();
}
LOG_DEBUG("Calling _exit({}) in {} to avoid AMD SMI cleanup issues...", _ec,
get_exe_name().c_str());
std::fflush(nullptr);
_exit(_ec);
}
invoke_exit_gotcha(_data, _func, _ec);
}
@@ -567,6 +567,18 @@ get_mem_alloc_address(
}
#endif
uint64_t
get_scratch_mem_alloc_size(
[[maybe_unused]] const rocprofiler_buffer_tracing_scratch_memory_record_t& record)
{
// Scratch memory samples from SDK versions prior to 7.0.2 do not include allocation_size
#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002)
return record.allocation_size;
#else
return 0;
#endif
}
void
cache_region(const rocprofiler_callback_tracing_record_t* record,
const rocprofiler_timestamp_t start_timestamp,
@@ -615,13 +627,26 @@ cache_kernel_dispatch(rocprofiler_buffer_tracing_kernel_dispatch_record_t* recor
record->dispatch_info.grid_size.z, stream_handle });
}
void
cache_scratch_memory(rocprofiler_buffer_tracing_scratch_memory_record_t* record,
uint64_t stream_handle)
{
trace_cache::get_metadata_registry().add_stream(stream_handle);
trace_cache::get_buffer_storage().store(trace_cache::scratch_memory_sample{
record->start_timestamp, record->end_timestamp, record->thread_id,
record->agent_id.handle, record->queue_id.handle,
static_cast<int32_t>(record->kind), static_cast<int32_t>(record->operation),
static_cast<int32_t>(record->flags), get_scratch_mem_alloc_size(*record),
record->correlation_id.internal, get_parent_stack_id(record->correlation_id),
stream_handle });
}
void
cache_memory_copy(rocprofiler_buffer_tracing_memory_copy_record_t* record,
uint64_t stream_handle)
{
trace_cache::get_metadata_registry().add_stream(stream_handle);
trace_cache::get_buffer_storage().store(trace_cache::memory_copy_sample{
record->start_timestamp, record->end_timestamp, record->thread_id,
record->dst_agent_id.handle, record->src_agent_id.handle,
static_cast<int32_t>(record->kind), static_cast<int32_t>(record->operation),
@@ -1759,6 +1784,120 @@ tool_tracing_buffered(rocprofiler_context_id_t /*context*/,
}
}
}
else if(header->kind == ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY)
{
auto* record =
static_cast<rocprofiler_buffer_tracing_scratch_memory_record_t*>(
header->payload);
bool _group_by_queue = _default_group_by_queue;
const auto* agent = tool_data->get_gpu_tool_agent(record->agent_id);
auto device_id = static_cast<uint32_t>(agent->device_id);
const auto& t_info = thread_info::get(record->thread_id, SystemTID);
auto thread_id_sequent = t_info->index_data->sequent_value;
auto _corr_id = record->correlation_id.internal;
auto _beg_ns = record->start_timestamp;
auto _end_ns = record->end_timestamp;
auto _name =
tool_data->buffered_tracing_info.at(record->kind, record->operation);
auto _stream_id = get_stream_id(record).handle;
if(_stream_id == 0)
{
// Scratch memory event is not associated with a HIP stream
_group_by_queue = true;
}
{
auto track_name = JOIN("", "GPU Scratch Memory [", device_id,
"] Thread ", record->thread_id);
cache_category<category::rocm_scratch_memory>();
cache_add_thread_info(record->thread_id);
cache_add_track(track_name.c_str(), record->thread_id);
cache_scratch_memory(record, _stream_id);
}
if(get_use_timemory())
{
auto _bundle = kernel_dispatch_bundle_t{ _name };
_bundle.push(thread_id_sequent).start().stop();
_bundle.get([_beg_ns, _end_ns](tim::component::wall_clock* _wc) {
_wc->set_value(_end_ns - _beg_ns);
_wc->set_accum(_end_ns - _beg_ns);
});
_bundle.pop();
}
if(get_use_perfetto())
{
// Scratch memory samples from SDK versions prior to 7.0.2 do not include
// allocation_size field, so counter tracks are not needed
#if(ROCPROFSYS_USE_ROCM > 0 && ROCPROFSYS_ROCM_VERSION >= 70002)
using counter_track = perfetto_counter_track<
rocprofiler_buffer_tracing_scratch_memory_record_t>;
if(!counter_track::exists(device_id))
{
auto track_name_alloc_size =
JOIN("", "GPU Scratch Memory [", device_id, "] (S) Thread ",
thread_id_sequent);
counter_track::emplace(device_id, track_name_alloc_size, "bytes");
}
if(record->operation == ROCPROFILER_SCRATCH_MEMORY_ALLOC)
{
TRACE_COUNTER("rocm_scratch_memory",
counter_track::at(device_id, 0), _beg_ns,
record->allocation_size);
}
#endif
auto add_perfetto_annotations = [&](::perfetto::EventContext ctx) {
if(config::get_perfetto_annotations())
{
tracing::add_perfetto_annotation(ctx, "begin_ns", _beg_ns);
tracing::add_perfetto_annotation(ctx, "end_ns", _end_ns);
tracing::add_perfetto_annotation(ctx, "corr_id", _corr_id);
tracing::add_perfetto_annotation(ctx, "stream_id",
_stream_id);
}
};
if(_group_by_queue)
{
auto track_name_events = [&]() {
return JOIN("", "GPU Scratch Memory (S) Events Thread ",
thread_id_sequent);
};
const auto _track = tracing::get_perfetto_track(
category::rocm_scratch_memory{}, track_name_events);
tracing::push_perfetto(category::rocm_scratch_memory{},
_name.data(), _track, _beg_ns,
::perfetto::Flow::ProcessScoped(_corr_id),
add_perfetto_annotations);
tracing::pop_perfetto(category::rocm_scratch_memory{}, "", _track,
_end_ns);
}
else
{
const auto _track = tracing::get_perfetto_track(
category::rocm_hip_stream{}, _track_desc_stream, _stream_id);
tracing::push_perfetto(category::rocm_hip_stream{}, _name.data(),
_track, _beg_ns,
::perfetto::Flow::ProcessScoped(_corr_id),
add_perfetto_annotations);
tracing::pop_perfetto(category::rocm_hip_stream{}, "", _track,
_end_ns);
}
}
}
else if(header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY)
{
auto* record =
@@ -2249,6 +2388,17 @@ tool_init(rocprofiler_client_finalize_t fini_func, void* user_data)
_data->primary_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0,
_data->memory_copy_buffer));
}
if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY) > 0)
{
ROCPROFILER_CALL(rocprofiler_create_buffer(
_data->primary_ctx, buffer_size, watermark,
ROCPROFILER_BUFFER_POLICY_LOSSLESS, tool_tracing_buffered, tool_data,
&_data->scratch_memory_buffer));
ROCPROFILER_CALL(rocprofiler_configure_buffer_tracing_service(
_data->primary_ctx, ROCPROFILER_BUFFER_TRACING_SCRATCH_MEMORY, nullptr, 0,
_data->scratch_memory_buffer));
}
#if(ROCPROFILER_VERSION >= 600)
if(_buffered_domain.count(ROCPROFILER_BUFFER_TRACING_MEMORY_ALLOCATION) > 0)
@@ -122,7 +122,7 @@ using backtrace_operation_map_t =
struct client_data
{
static constexpr size_t num_buffers = 4;
static constexpr size_t num_buffers = 5;
static constexpr size_t num_contexts = 2;
using buffer_name_info_t = rocprofiler::sdk::buffer_name_info_t<std::string_view>;
@@ -138,6 +138,7 @@ struct client_data
rocprofiler_context_id_t primary_ctx = { 0 };
rocprofiler_context_id_t counter_ctx = { 0 };
rocprofiler_buffer_id_t kernel_dispatch_buffer = { 0 };
rocprofiler_buffer_id_t scratch_memory_buffer = { 0 };
rocprofiler_buffer_id_t memory_copy_buffer = { 0 };
rocprofiler_buffer_id_t memory_alloc_buffer = { 0 };
rocprofiler_buffer_id_t counter_collection_buffer = { 0 };
@@ -179,12 +180,9 @@ client_data::get_contexts() const
inline client_data::buffer_id_vec_t
client_data::get_buffers() const
{
return buffer_id_vec_t{
kernel_dispatch_buffer,
memory_copy_buffer,
memory_alloc_buffer,
counter_collection_buffer,
};
return buffer_id_vec_t{ kernel_dispatch_buffer, scratch_memory_buffer,
memory_copy_buffer, memory_alloc_buffer,
counter_collection_buffer };
}
inline const rocprofsys_agent_t*
@@ -47,7 +47,6 @@ from .parser import parse_uploaded_file
from .parser import find_causal_files
import plotly.graph_objects as go
file_timestamp = 0
global_data = pd.DataFrame()
global_samples = pd.DataFrame()
@@ -37,7 +37,6 @@ from . import libpyrocprofsys
from .libpyrocprofsys.profiler import profiler_init as _profiler_init
from .libpyrocprofsys.profiler import profiler_finalize as _profiler_fini
__all__ = ["exec_", "_file", "_get_argv", "_initialize", "_finalize"]
@@ -44,7 +44,6 @@ from .libpyrocprofsys.user import pop_region
from .common import _initialize
from .common import _file
__all__ = [
"region",
"Region",

Sommige bestanden werden niet getoond omdat er teveel bestanden zijn veranderd in deze diff Meer weergeven