Update to use rocprofiler-sdk (#55)

- Rename the CMake option "ROCPROFSYS_USE_HIP" to "ROCPROFSYS_USE_ROCM"
- Remove the "ROCPROFSYS_USE_ROCM_SMI" CMake option; it is now controlled by the "ROCPROFSYS_USE_ROCM" option instead.
   - Runtime configuration can still toggle ROCPROFSYS_USE_ROCM_SMI to disable the sampling.
- Rename ROCPROFSYS_HIP_VERSION macro to ROCPROFSYS_ROCM_VERSION and remove blocks for `ROCPROFSYS_ROCM_VERSION < 60000`
- Remove ROCPROFSYS_USE_ROCTRACER and ROCPROFSYS_USE_ROCPROFILER
- Update test cases
- Update docker files and workflows to install CMake 3.21, which is required by the rocprofiler-sdk find_package script.
- Remove rocm-6.2 from workflows due to a rocprofiler-sdk API change.

[ROCm/rocprofiler-systems commit: 88aa2d3cbe]
Этот коммит содержится в:
David Galiffi
2024-12-13 18:48:39 -05:00
коммит произвёл GitHub
родитель 417d22ee3e
Коммит b29cfac106
87 изменённых файлов: 3842 добавлений и 6261 удалений
+69 -349
Просмотреть файл
@@ -20,22 +20,19 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#define ROCPROFILER_SDK_CEREAL_NAMESPACE_BEGIN \
namespace tim \
{ \
namespace cereal \
{
#define ROCPROFILER_SDK_CEREAL_NAMESPACE_END \
} \
} // namespace ::tim::cereal
#include "common/defines.h"
#if !defined(ROCPROFSYS_USE_ROCM_SMI)
# define ROCPROFSYS_USE_ROCM_SMI 0
#endif
#if !defined(ROCPROFSYS_USE_HIP)
# define ROCPROFSYS_USE_HIP 0
#endif
#include "core/hip_runtime.hpp"
#if ROCPROFSYS_USE_HIP > 0
# if !defined(TIMEMORY_USE_HIP)
# define TIMEMORY_USE_HIP 1
# endif
#if !defined(ROCPROFSYS_USE_ROCM)
# define ROCPROFSYS_USE_ROCM 0
#endif
#include "debug.hpp"
@@ -44,24 +41,11 @@
#include <timemory/manager.hpp>
#if ROCPROFSYS_USE_ROCM_SMI > 0
#if ROCPROFSYS_USE_ROCM > 0
# include <rocm_smi/rocm_smi.h>
#endif
#if ROCPROFSYS_USE_HIP > 0
# include <timemory/components/hip/backends.hpp>
# if !defined(ROCPROFSYS_HIP_RUNTIME_CALL)
# define ROCPROFSYS_HIP_RUNTIME_CALL(err) \
{ \
if(err != ::tim::hip::success_v && (int) err != 0) \
{ \
ROCPROFSYS_THROW( \
"[%s:%d] Warning! HIP API call failed with code %i :: %s\n", \
__FILE__, __LINE__, (int) err, hipGetErrorString(err)); \
} \
}
# endif
# include <rocprofiler-sdk/agent.h>
# include <rocprofiler-sdk/cxx/serialization.hpp>
# include <rocprofiler-sdk/fwd.h>
#endif
namespace rocprofsys
@@ -70,9 +54,7 @@ namespace gpu
{
namespace
{
namespace scope = ::tim::scope;
#if ROCPROFSYS_USE_ROCM_SMI > 0
#if ROCPROFSYS_USE_ROCM > 0
# define ROCPROFSYS_ROCM_SMI_CALL(ERROR_CODE) \
::rocprofsys::gpu::check_rsmi_error(ERROR_CODE, __FILE__, __LINE__)
@@ -108,99 +90,47 @@ rsmi_init()
return _rsmi_init;
}
#endif
#endif // ROCPROFSYS_USE_ROCM > 0
#if ROCPROFSYS_HIP_VERSION >= 60000
template <typename ArchiveT, typename ArgT,
std::enable_if_t<!std::is_pointer<ArgT>::value, int> = 0>
void
device_prop_serialize(ArchiveT& archive, const char* name, const ArgT& arg)
int32_t
query_rocm_gpu_agents()
{
namespace cereal = tim::cereal;
using cereal::make_nvp;
archive(make_nvp(name, arg));
}
template <typename ArchiveT, typename ArgT, size_t N>
void
device_prop_serialize(ArchiveT& archive, const char* name, ArgT arg[N])
{
if constexpr(!std::is_same<ArgT, char>::value &&
!std::is_same<ArgT, const char>::value)
{
namespace cereal = tim::cereal;
using cereal::make_nvp;
auto data = std::array<int, N>{};
for(size_t i = 0; i < N; ++i)
data[i] = arg[i];
archive(make_nvp(name, data));
}
else
{
device_prop_serialize(archive, name, std::string{ arg });
}
}
template <typename ArchiveT>
void
device_prop_serialize(ArchiveT& archive, const char* name, hipUUID_t arg)
{
constexpr auto N = sizeof(arg.bytes);
namespace cereal = tim::cereal;
using cereal::make_nvp;
auto data = std::array<char, N + 1>{};
data.fill('\0');
for(size_t i = 0; i < N; ++i)
data[i] = arg.bytes[i];
auto str_v = std::string_view{ data.data() };
auto str = std::string{ str_v }.substr(0, str_v.find('\0'));
archive(make_nvp(name, str));
}
template <typename ArchiveT>
void
device_prop_serialize(ArchiveT& archive, const char* name, hipDeviceArch_t arg)
{
namespace cereal = tim::cereal;
using cereal::make_nvp;
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(NAME) \
{ \
auto val = arg.NAME; \
archive(make_nvp(#NAME, val)); \
int32_t _dev_cnt = 0;
#if ROCPROFSYS_USE_ROCM > 0
auto iterator = [](rocprofiler_agent_version_t /*version*/, const void** agents,
size_t num_agents, void* user_data) -> rocprofiler_status_t {
auto* _cnt = static_cast<int32_t*>(user_data);
for(size_t i = 0; i < num_agents; ++i)
{
const auto* _agent = static_cast<const rocprofiler_agent_v0_t*>(agents[i]);
if(_agent && _agent->type == ROCPROFILER_AGENT_TYPE_GPU) *_cnt += 1;
}
return ROCPROFILER_STATUS_SUCCESS;
};
archive.setNextName(name);
archive.startNode();
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalInt32Atomics)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalFloatAtomicExch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedInt32Atomics)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedFloatAtomicExch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasFloatAtomicAdd)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasGlobalInt64Atomics)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSharedInt64Atomics)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasDoubles)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpVote)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpBallot)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasWarpShuffle)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasFunnelShift)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasThreadFenceSystem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSyncThreadsExt)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasSurfaceFuncs)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(has3dGrid)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH(hasDynamicParallelism)
archive.finishNode();
# undef ROCPROFSYS_SERIALIZE_HIP_DEVICE_ARCH
}
try
{
rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator,
sizeof(rocprofiler_agent_v0_t), &_dev_cnt);
} catch(std::exception& _e)
{
ROCPROFSYS_BASIC_VERBOSE(
1, "Exception thrown getting the rocm agents: %s. _dev_cnt=%d\n", _e.what(),
_dev_cnt);
}
// rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator,
// sizeof(rocprofiler_agent_v0_t), &_dev_cnt);
#endif
return _dev_cnt;
}
} // namespace
int
hip_device_count()
rocm_device_count()
{
#if ROCPROFSYS_USE_HIP > 0
return ::tim::hip::device_count();
#if ROCPROFSYS_USE_ROCM > 0
static int _num_devices = query_rocm_gpu_agents();
return _num_devices;
#else
return 0;
#endif
@@ -209,7 +139,7 @@ hip_device_count()
int
rsmi_device_count()
{
#if ROCPROFSYS_USE_ROCM_SMI > 0
#if ROCPROFSYS_USE_ROCM > 0
if(!rsmi_init()) return 0;
static auto _num_devices = []() {
@@ -234,11 +164,8 @@ rsmi_device_count()
int
device_count()
{
#if ROCPROFSYS_USE_ROCM_SMI > 0
// store as static since calls after rsmi_shutdown will return zero
return rsmi_device_count();
#elif ROCPROFSYS_USE_HIP > 0
return ::tim::hip::device_count();
#if ROCPROFSYS_USE_ROCM > 0
return rocm_device_count();
#else
return 0;
#endif
@@ -246,251 +173,44 @@ device_count()
template <typename ArchiveT>
void
add_hip_device_metadata(ArchiveT& ar)
add_device_metadata(ArchiveT& ar)
{
namespace cereal = tim::cereal;
using cereal::make_nvp;
#if ROCPROFSYS_USE_HIP > 0
int _device_count = 0;
int _current_device = 0;
hipError_t _device_count_err = hipGetDeviceCount(&_device_count);
#if ROCPROFSYS_USE_ROCM > 0
using agent_vec_t = std::vector<rocprofiler_agent_v0_t>;
if(_device_count_err != hipSuccess) return;
hipError_t _current_device_err = hipGetDevice(&_current_device);
scope::destructor _dtor{ [_current_device, _current_device_err]() {
if(_current_device_err == hipSuccess)
auto _agents_vec = agent_vec_t{};
auto iterator = [](rocprofiler_agent_version_t /*version*/, const void** agents,
size_t num_agents, void* user_data) -> rocprofiler_status_t {
auto* _agents_vec_v = static_cast<agent_vec_t*>(user_data);
_agents_vec_v->reserve(num_agents);
for(size_t i = 0; i < num_agents; ++i)
{
ROCPROFSYS_HIP_RUNTIME_CALL(hipSetDevice(_current_device));
const auto* _agent = static_cast<const rocprofiler_agent_v0_t*>(agents[i]);
if(_agent) _agents_vec_v->emplace_back(*_agent);
}
} };
return ROCPROFILER_STATUS_SUCCESS;
};
rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, iterator,
sizeof(rocprofiler_agent_v0_t), &_agents_vec);
if(_current_device_err != hipSuccess || _device_count == 0) return;
ar.setNextName("hip_device_properties");
ar.startNode();
ar.makeArray();
scope::destructor _prop_dtor{ [&ar]() { ar.finishNode(); } };
for(int dev = 0; dev < _device_count; ++dev)
{
auto _device_prop = hipDeviceProp_t{};
int _driver_version = 0;
int _runtime_version = 0;
ROCPROFSYS_HIP_RUNTIME_CALL(hipSetDevice(dev));
ROCPROFSYS_HIP_RUNTIME_CALL(hipGetDeviceProperties(&_device_prop, dev));
ROCPROFSYS_HIP_RUNTIME_CALL(hipDriverGetVersion(&_driver_version));
ROCPROFSYS_HIP_RUNTIME_CALL(hipRuntimeGetVersion(&_runtime_version));
ar.startNode();
# if ROCPROFSYS_HIP_VERSION < 60000
using intvec_t = std::vector<int>;
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(NAME) \
ar(make_nvp(#NAME, _device_prop.NAME));
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY(NAME, ...) \
ar(make_nvp(NAME, __VA_ARGS__));
ar(make_nvp("name", std::string{ _device_prop.name }));
ar(make_nvp("driver_version", _driver_version));
ar(make_nvp("runtime_version", _runtime_version));
ar(make_nvp("capability.major_version", _device_prop.major));
ar(make_nvp("capability.minor_version", _device_prop.minor));
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalGlobalMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalConstMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockRate)
# if ROCPROFSYS_HIP_VERSION >= 50000
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryClockRate)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryBusWidth)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(l2CacheSize)
# endif
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(warpSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiProcessorCount)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerMultiProcessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY(
"maxThreadsDim",
intvec_t{ _device_prop.maxThreadsDim[0], _device_prop.maxThreadsDim[1],
_device_prop.maxThreadsDim[2] })
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP_ARRAY(
"maxGridSize",
intvec_t{ _device_prop.maxGridSize[0], _device_prop.maxGridSize[1],
_device_prop.maxGridSize[2] })
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memPitch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(textureAlignment)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(kernelExecTimeoutEnabled)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(integrated)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canMapHostMemory)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ECCEnabled)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeLaunch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceLaunch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDomainID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciBusID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDeviceID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computeMode)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArchName)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isMultiGpuBoard)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockInstructionRate)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccess)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccessUsesHostPageTables)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(directManagedMemAccessFromHost)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentManagedAccess)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentKernels)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSharedMemoryPerMultiProcessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asicRevision)
# else
# define ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(NAME) \
device_prop_serialize(ar, #NAME, _device_prop.NAME);
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(name)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(uuid)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(luid)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(luidDeviceNodeMask)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalGlobalMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(warpSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memPitch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsDim)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxGridSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockRate)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(totalConstMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(major)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(minor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(textureAlignment)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(texturePitchAlignment)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(deviceOverlap)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiProcessorCount)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(kernelExecTimeoutEnabled)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(integrated)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canMapHostMemory)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computeMode)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DMipmap)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DLinear)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DMipmap)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DLinear)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DGather)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture3D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture3DAlt)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTextureCubemap)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture1DLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTexture2DLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxTextureCubemapLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface1D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface2D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface3D)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface1DLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurface2DLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurfaceCubemap)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSurfaceCubemapLayered)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(surfaceAlignment)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentKernels)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ECCEnabled)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciBusID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDeviceID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pciDomainID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(tccDriver)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asyncEngineCount)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(unifiedAddressing)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryClockRate)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryBusWidth)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(l2CacheSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(persistingL2CacheMaxSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxThreadsPerMultiProcessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(streamPrioritiesSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(globalL1CacheSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(localL1CacheSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerMultiprocessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(regsPerMultiprocessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(managedMemory)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isMultiGpuBoard)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(multiGpuBoardGroupID)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostNativeAtomicSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(singleToDoublePrecisionPerfRatio)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccess)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(concurrentManagedAccess)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(computePreemptionSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(canUseHostPointerForRegisteredMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeLaunch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceLaunch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sharedMemPerBlockOptin)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(pageableMemoryAccessUsesHostPageTables)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(directManagedMemAccessFromHost)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxBlocksPerMultiProcessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(accessPolicyMaxWindowSize)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(reservedSharedMemPerBlock)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostRegisterSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(sparseHipArraySupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hostRegisterReadOnlySupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(timelineSemaphoreInteropSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryPoolsSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMASupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMAFlushWritesOptions)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gpuDirectRDMAWritesOrdering)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(memoryPoolSupportedHandleTypes)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(deferredMappingHipArraySupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(ipcEventSupported)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clusterLaunch)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(unifiedFunctionPointers)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(gcnArchName)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(maxSharedMemoryPerMultiProcessor)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(clockInstructionRate)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(arch)
// ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hdpMemFlushCntl)
// ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(hdpRegFlushCntl)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedFunc)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedGridDim)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedBlockDim)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(cooperativeMultiDeviceUnmatchedSharedMem)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(isLargeBar)
ROCPROFSYS_SERIALIZE_HIP_DEVICE_PROP(asicRevision)
# endif
const auto _compute_mode_descr = std::array<const char*, 6>{
"Default (multiple host threads can use ::hipSetDevice() with device "
"simultaneously)",
"Exclusive (only one host thread in one process is able to use "
"::hipSetDevice() with this device)",
"Prohibited (no host thread can use ::hipSetDevice() with this device)",
"Exclusive Process (many threads in one process is able to use "
"::hipSetDevice() with this device)",
"Unknown",
nullptr
};
auto _compute_mode = std::min<int>(_device_prop.computeMode, 5);
ar(make_nvp("computeModeDescription",
std::string{ _compute_mode_descr.at(_compute_mode) }));
ar.finishNode();
}
ar(make_nvp("rocm_agents", _agents_vec));
#else
(void) ar;
#endif
}
void
add_hip_device_metadata()
add_device_metadata()
{
if(device_count() == 0) return;
ROCPROFSYS_METADATA([](auto& ar) {
try
{
add_hip_device_metadata(ar);
add_device_metadata(ar);
} catch(std::runtime_error& _e)
{
ROCPROFSYS_VERBOSE(2, "%s\n", _e.what());