diff --git a/projects/rocprofiler-compute/src/hip/hip.py b/projects/rocprofiler-compute/src/hip/hip.py new file mode 100644 index 0000000000..04af444127 --- /dev/null +++ b/projects/rocprofiler-compute/src/hip/hip.py @@ -0,0 +1,453 @@ +############################################################################## +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +############################################################################## + +import ctypes +from ctypes import ( + POINTER, + Structure, + byref, + c_char, + c_char_p, + c_float, + c_int, + c_size_t, + c_uint, + c_uint8, + c_void_p, +) + +_lib = ctypes.CDLL("libamdhip64.so") + + +# Mirrors struct hipUUID_t +class HIPUUID(Structure): + _fields_ = [ + ("uuid", c_uint8 * 16), + ] + + +# Mirrors hipDeviceArch_t +class HIPDeviceArch(Structure): + _fields_ = [ + # 32-bit Atomics + ("hasGlobalInt32Atomics", c_uint, 1), + ("hasGlobalFloatAtomicExch", c_uint, 1), + ("hasSharedInt32Atomics", c_uint, 1), + ("hasSharedFloatAtomicExch", c_uint, 1), + ("hasFloatAtomicAdd", c_uint, 1), + # 64-bit Atomics + ("hasGlobalInt64Atomics", c_uint, 1), + ("hasSharedInt64Atomics", c_uint, 1), + # Doubles + ("hasDoubles", c_uint, 1), + # Warp cross-lane operations + ("hasWarpVote", c_uint, 1), + ("hasWarpBallot", c_uint, 1), + ("hasWarpShuffle", c_uint, 1), + ("hasFunnelShift", c_uint, 1), + # Sync + ("hasThreadFenceSystem", c_uint, 1), + ("hasSyncThreadsExt", c_uint, 1), + # Misc + ("hasSurfaceFuncs", c_uint, 1), + ("has3dGrid", c_uint, 1), + ("hasDynamicParallelism", c_uint, 1), + ] + + +# Mirrors hipDeviceProp_tR0600 +class HIPDeviceProperties(Structure): + _fields_ = [ + ("name_str", c_char * 256), + ("uuid", HIPUUID), + ("luid", c_char * 8), + ("luidDeviceNodeMask", c_uint), + ("totalGlobalMem", c_size_t), + ("sharedMemPerBlock", c_size_t), + ("regsPerBlock", c_int), + ("warpSize", c_int), + ("memPitch", c_size_t), + ("maxThreadsPerBlock", c_int), + ("maxThreadsDim", c_int * 3), + ("maxGridSize", c_int * 3), + ("clockRate", c_int), + ("totalConstMem", c_size_t), + ("major", c_int), + ("minor", c_int), + ("textureAlignment", c_size_t), + ("texturePitchAlignment", c_size_t), + ("deviceOverlap", c_int), + ("multiProcessorCount", c_int), + ("kernelExecTimeoutEnabled", c_int), + ("integrated", c_int), + ("canMapHostMemory", c_int), + ("computeMode", c_int), + ("maxTexture1D", c_int), + ("maxTexture1DMipmap", c_int), + ("maxTexture1DLinear", c_int), + ("maxTexture2D", c_int * 2), + ("maxTexture2DMipmap", c_int * 2), + ("maxTexture2DLinear", c_int * 3), + ("maxTexture2DGather", c_int * 2), + ("maxTexture3D", c_int * 3), + ("maxTexture3DAlt", c_int * 3), + ("maxTextureCubemap", c_int), + ("maxTexture1DLayered", c_int * 2), + ("maxTexture2DLayered", c_int * 3), + ("maxTextureCubemapLayered", c_int * 2), + ("maxSurface1D", c_int), + ("maxSurface2D", c_int * 2), + ("maxSurface3D", c_int * 3), + ("maxSurface1DLayered", c_int * 2), + ("maxSurface2DLayered", c_int * 3), + ("maxSurfaceCubemap", c_int), + ("maxSurfaceCubemapLayered", c_int * 2), + ("surfaceAlignment", c_size_t), + ("concurrentKernels", c_int), + ("ECCEnabled", c_int), + ("pciBusID", c_int), + ("pciDeviceID", c_int), + ("pciDomainID", c_int), + ("tccDriver", c_int), + ("asyncEngineCount", c_int), + ("unifiedAddressing", c_int), + ("memoryClockRate", c_int), + ("memoryBusWidth", c_int), + ("l2CacheSize", c_int), + ("persistingL2CacheMaxSize", c_int), + ("maxThreadsPerMultiProcessor", c_int), + ("streamPrioritiesSupported", c_int), + ("globalL1CacheSupported", c_int), + ("localL1CacheSupported", c_int), + ("sharedMemPerMultiprocessor", c_size_t), + ("regsPerMultiprocessor", c_int), + ("managedMemory", c_int), + ("isMultiGpuBoard", c_int), + ("multiGpuBoardGroupID", c_int), + ("hostNativeAtomicSupported", c_int), + ("singleToDoublePrecisionPerfRatio", c_int), + ("pageableMemoryAccess", c_int), + ("concurrentManagedAccess", c_int), + ("computePreemptionSupported", c_int), + ("canUseHostPointerForRegisteredMem", c_int), + ("cooperativeLaunch", c_int), + ("cooperativeMultiDeviceLaunch", c_int), + ("sharedMemPerBlockOptin", c_size_t), + ("pageableMemoryAccessUsesHostPageTables", c_int), + ("directManagedMemAccessFromHost", c_int), + ("maxBlocksPerMultiProcessor", c_int), + ("accessPolicyMaxWindowSize", c_int), + ("reservedSharedMemPerBlock", c_size_t), + ("hostRegisterSupported", c_int), + ("sparseHipArraySupported", c_int), + ("hostRegisterReadOnlySupported", c_int), + ("timelineSemaphoreInteropSupported", c_int), + ("memoryPoolsSupported", c_int), + ("gpuDirectRDMASupported", c_int), + ("gpuDirectRDMAFlushWritesOptions", c_uint), + ("gpuDirectRDMAWritesOrdering", c_int), + ("memoryPoolSupportedHandleTypes", c_uint), + ("deferredMappingHipArraySupported", c_int), + ("ipcEventSupported", c_int), + ("clusterLaunch", c_int), + ("unifiedFunctionPointers", c_int), + ("reserved", c_int * 63), + ("hipReserved", c_int * 32), + # HIP-only + ("gcnArchName_str", c_char * 256), + ("maxSharedMemoryPerMultiProcessor", c_size_t), + ("clockInstructionRate", c_int), + ("arch", HIPDeviceArch), + ("hdpMemFlushCntl", POINTER(c_uint)), + ("hdpRegFlushCntl", POINTER(c_uint)), + ("cooperativeMultiDeviceUnmatchedFunc", c_int), + ("cooperativeMultiDeviceUnmatchedGridDim", c_int), + ("cooperativeMultiDeviceUnmatchedBlockDim", c_int), + ("cooperativeMultiDeviceUnmatchedSharedMem", c_int), + ("isLargeBar", c_int), + ("asicRevision", c_int), + ] + + # Add properties as needed + @property + def name(self) -> str: + return self.name_str.decode("utf-8") + + @property + def gcnArchName(self) -> str: + return self.gcnArchName_str.decode("utf-8") + + +# Declare HIP functions here +_lib.hipGetDeviceCount.restype = c_int +_lib.hipGetDeviceCount.argtypes = [POINTER(c_int)] + +_lib.hipGetDevicePropertiesR0600.restype = c_int +_lib.hipGetDevicePropertiesR0600.argtypes = [POINTER(HIPDeviceProperties), c_int] + +_lib.hipMalloc.restype = c_int +_lib.hipMalloc.argtypes = [POINTER(c_void_p), c_size_t] + +_lib.hipFree.restype = c_int +_lib.hipFree.argtypes = [c_void_p] + +_lib.hipMemcpyHtoD.restype = c_int +_lib.hipMemcpyHtoD.argtypes = [c_void_p, c_void_p, c_size_t] + +_lib.hipMemcpyDtoH.restype = c_int +_lib.hipMemcpyDtoH.argtypes = [c_void_p, c_void_p, c_size_t] + +_lib.hipSetDevice.restype = c_int +_lib.hipSetDevice.argtypes = [c_int] + +_lib.hipModuleLoadData.restype = c_int +_lib.hipModuleLoadData.argtypes = [POINTER(c_void_p), c_char_p] + +_lib.hipModuleUnload.restype = c_int +_lib.hipModuleUnload.argtypes = [c_void_p] + +_lib.hipModuleGetFunction.restype = c_int +_lib.hipModuleGetFunction.argtypes = [POINTER(c_void_p), c_void_p, c_char_p] + +_lib.hipDeviceSynchronize.restype = c_int +_lib.hipDeviceSynchronize.argtypes = [] + +_lib.hipModuleLaunchKernel.restype = c_int +_lib.hipModuleLaunchKernel.argtypes = [ + c_void_p, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + c_uint, + c_void_p, + POINTER(c_void_p), + POINTER(c_void_p), +] + +_lib.hipEventCreate.restype = c_int +_lib.hipEventCreate.argtypes = [POINTER(c_void_p)] + +_lib.hipEventDestroy.restype = c_int +_lib.hipEventDestroy.argtypes = [c_void_p] + +_lib.hipEventRecord.restype = c_int +_lib.hipEventRecord.argtypes = [c_void_p, c_void_p] + +_lib.hipEventElapsedTime.restype = c_int +_lib.hipEventElapsedTime.argtypes = [POINTER(c_float), c_void_p, c_void_p] + + +class HIPError(Exception): + def __init__(self, code: int) -> None: + self.code = code + self.message = f"HIP Error {self.code}" + + def __str__(self) -> str: + return self.message + + +class HIPDeviceMemory: + def __init__(self, ptr: POINTER) -> None: + self.ptr = ptr + + def __del__(self) -> None: + _lib.hipFree(self.ptr) + + +class HIPEvent: + def __init__(self, handle: POINTER) -> None: + self.handle = handle + + def __del__(self) -> None: + _lib.hipEventDestroy(self.handle) + + +class HIPModule: + def __init__(self, handle: POINTER) -> None: + self.handle = handle + + def __del__(self) -> None: + _lib.hipModuleUnload(self.handle) + + +# Implement HIP functions here + + +def hipGetDeviceCount() -> int: + + device_count = c_int() + status = _lib.hipGetDeviceCount(byref(device_count)) + + if status != 0: + raise HIPError(status) + + return device_count.value + + +def hipGetDeviceProperties(device_id: int) -> HIPDeviceProperties: + + props = HIPDeviceProperties() + res = _lib.hipGetDevicePropertiesR0600(byref(props), device_id) + + if res != 0: + raise HIPError(res) + + return props + + +def hipMalloc(size: int) -> HIPDeviceMemory: + + buf_size = c_size_t(size) + ptr = c_void_p() + + status = _lib.hipMalloc(byref(ptr), buf_size) + + if status != 0: + raise HIPError(status) + + return HIPDeviceMemory(ptr) + + +def hipMemcpyHtoD(dst: HIPDeviceMemory, src: POINTER, size: int) -> None: + + res = _lib.hipMemcpyHtoD(dst.ptr, src, size) + + if res != 0: + raise HIPError(res) + + +def hipMemcpyDtoH(dst: POINTER, src: HIPDeviceMemory, size: int) -> None: + + res = _lib.hipMemcpyDtoH(dst, src.ptr, size) + + if res != 0: + raise HIPError(res) + + +def hipSetDevice(id: int) -> None: + + status = _lib.hipSetDevice(id) + + if status != 0: + raise HIPError(status) + + +def hipDeviceSynchronize() -> None: + + res = _lib.hipDeviceSynchronize() + + if res != 0: + raise HIPError(res) + + +def hipModuleLoadData(code: POINTER) -> HIPModule: + + module = c_void_p() + res = _lib.hipModuleLoadData(byref(module), code) + + if res != 0: + raise HIPError(res) + + return HIPModule(module) + + +def hipModuleGetFunction(module: POINTER, name: str) -> POINTER: + + name_bytes = name.encode("utf-8") + func = c_void_p() + + res = _lib.hipModuleGetFunction(byref(func), module.handle, name_bytes) + + if res != 0: + raise HIPError(res) + + return func + + +def hipModuleLaunchKernel( + func: POINTER, + grid_dim_x: int, + grid_dim_y: int, + grid_dim_z: int, + block_dim_x: int, + block_dim_y: int, + block_dim_z: int, + shared_mem_size: int, + stream: POINTER, + kernel_params: POINTER, + extra: POINTER = None, +) -> None: + + res = _lib.hipModuleLaunchKernel( + func, + grid_dim_x, + grid_dim_y, + grid_dim_z, + block_dim_x, + block_dim_y, + block_dim_z, + shared_mem_size, + stream, + kernel_params, + extra, + ) + + if res != 0: + raise HIPError(res) + + +def hipEventCreate() -> HIPEvent: + + handle = c_void_p() + + res = _lib.hipEventCreate(byref(handle)) + + if res != 0: + raise HIPError(res) + + return HIPEvent(handle) + + +def hipEventRecord(event: HIPEvent, stream: POINTER = None) -> None: + + res = _lib.hipEventRecord(event.handle, stream) + + if res != 0: + raise HIPError(res) + + +def hipEventElapsedTime(start: HIPEvent, stop: HIPEvent) -> float: + + ms = c_float() + + res = _lib.hipEventElapsedTime(byref(ms), start.handle, stop.handle) + + if res != 0: + raise HIPError(res) + + return ms.value diff --git a/projects/rocprofiler-compute/src/hip/hiprtc.py b/projects/rocprofiler-compute/src/hip/hiprtc.py new file mode 100644 index 0000000000..a0e33136eb --- /dev/null +++ b/projects/rocprofiler-compute/src/hip/hiprtc.py @@ -0,0 +1,205 @@ +############################################################################## +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +############################################################################## + +import ctypes +from ctypes import ( + POINTER, + byref, + c_char, + c_char_p, + c_int, + c_size_t, + c_void_p, +) + +_lib = ctypes.CDLL("libhiprtc.so") + + +_lib.hiprtcCreateProgram.restype = c_int +_lib.hiprtcCreateProgram.argtypes = [ + POINTER(c_void_p), + c_char_p, + c_char_p, + c_int, + POINTER(c_char_p), + POINTER(c_char_p), +] + +_lib.hiprtcDestroyProgram.restype = c_int +_lib.hiprtcDestroyProgram.argtypes = [ + POINTER(c_void_p), +] + +_lib.hiprtcCompileProgram.restype = c_int +_lib.hiprtcCompileProgram.argtypes = [ + c_void_p, + c_int, + POINTER(c_char_p), +] + +_lib.hiprtcGetProgramLogSize.restype = c_int +_lib.hiprtcGetProgramLogSize.argtypes = [ + c_void_p, + POINTER(c_size_t), +] + +_lib.hiprtcGetProgramLog.restype = c_int +_lib.hiprtcGetProgramLog.argtypes = [ + c_void_p, + c_char_p, +] + +_lib.hiprtcGetCodeSize.restype = c_int +_lib.hiprtcGetCodeSize.argtypes = [ + c_void_p, + POINTER(c_size_t), +] + +_lib.hiprtcGetCode.restype = c_int +_lib.hiprtcGetCode.argtypes = [ + c_void_p, + c_char_p, +] + +_lib.hiprtcAddNameExpression.restype = c_int +_lib.hiprtcAddNameExpression.argtypes = [ + c_void_p, + c_char_p, +] + +_lib.hiprtcGetLoweredName.restype = c_int +_lib.hiprtcGetLoweredName.argtypes = [c_void_p, c_char_p, POINTER(c_char_p)] + + +class HIPRTCError(Exception): + def __init__(self, code: int) -> None: + self.code = code + self.message = f"HIP Error {self.code}" + + def __str__(self) -> str: + return self.message + + +class HIPRTCProgram: + def __init__(self, handle: POINTER) -> None: + self.handle = handle + + def __del__(self) -> None: + _lib.hiprtcDestroyProgram(self.handle) + + +# TODO: Handle headers +def hiprtcCreateProgram(src: str, name: str) -> HIPRTCProgram: + + src_bytes = src.encode("utf-8") + name_bytes = name.encode("utf-8") + + prog = c_void_p() + + res = _lib.hiprtcCreateProgram(byref(prog), src_bytes, name_bytes, 0, None, None) + + if res != 0: + raise HIPRTCError(res) + + return HIPRTCProgram(prog) + + +# TODO: Handle compile options +def hiprtcCompileProgram(prog: HIPRTCProgram) -> None: + + res = _lib.hiprtcCompileProgram(prog.handle, 0, None) + + if res != 0: + raise HIPRTCError(res) + + +def hiprtcGetProgramLogSize(prog: HIPRTCProgram) -> int: + + size = c_size_t(0) + + res = _lib.hiprtcGetProgramLogSize(prog.handle, byref(size)) + + if res != 0: + raise HIPRTCError(res) + + return size.value + + +def hiprtcGetProgramLog(prog: HIPRTCProgram) -> str: + + size = hiprtcGetProgramLogSize(prog) + buf = (ctypes.c_char * size)() + + res = _lib.hiprtcGetProgramLog(prog.handle, buf) + + if res != 0: + raise HIPRTCError(res) + + return ctypes.string_at(buf, size).decode("utf-8", errors="ignore") + + +def hiprtcGetCodeSize(prog: HIPRTCProgram) -> int: + size = c_size_t(0) + res = _lib.hiprtcGetCodeSize(prog.handle, byref(size)) + + if res != 0: + raise HIPRTCError(res) + + return size.value + + +def hiprtcGetCode(prog: HIPRTCProgram) -> POINTER: + + size = hiprtcGetCodeSize(prog) + buf = (c_char * size)() + res = _lib.hiprtcGetCode(prog.handle, buf) + + if res != 0: + raise HIPRTCError(res) + + return buf + + +def hiprtcGetLoweredName(prog: HIPRTCProgram, name_expression: str) -> str: + + expr_bytes = name_expression.encode("utf-8") + name_bytes = c_char_p() + + res = _lib.hiprtcGetLoweredName(prog.handle, expr_bytes, name_bytes) + + if res != 0: + raise HIPRTCError(res) + + return name_bytes.value.decode("utf-8") + + +def hiprtcAddNameExpression(prog: HIPRTCProgram, name_expression: str) -> None: + + expr_bytes = name_expression.encode("utf-8") + + res = _lib.hiprtcAddNameExpression(prog.handle, expr_bytes) + + if res != 0: + raise HIPRTCError(res) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py index ab1e284b7b..1d985e49f1 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py @@ -36,6 +36,7 @@ import yaml import config from roofline import Roofline +from utils import benchmark from utils.amdsmi_interface import amdsmi_ctx, get_gpu_model, get_mem_max_clock from utils.logger import ( console_debug, @@ -53,7 +54,6 @@ from utils.utils import ( convert_metric_id_to_panel_info, get_panel_alias, is_tcc_channel_counter, - mibench, parse_sets_yaml, ) @@ -680,7 +680,9 @@ class OmniSoC_Base: "roofline", f"Checking for roofline.csv in {self.get_args().path}" ) if not (Path(self.get_args().path) / "roofline.csv").is_file(): - mibench(self.get_args(), self._mspec) + result = benchmark.run_on_devices([self.get_args().device]) + benchmark.dump_csv(result, f"{self.get_args().path}/roofline.csv") + self.roofline_obj.post_processing() @abstractmethod diff --git a/projects/rocprofiler-compute/src/utils/benchmark.py b/projects/rocprofiler-compute/src/utils/benchmark.py new file mode 100644 index 0000000000..edf7bfee24 --- /dev/null +++ b/projects/rocprofiler-compute/src/utils/benchmark.py @@ -0,0 +1,1148 @@ +############################################################################## +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +############################################################################## + +import csv +import math +from collections import namedtuple +from ctypes import ( + POINTER, + byref, + c_double, + c_float, + c_int, + c_int8, + c_int32, + c_int64, + c_short, + c_void_p, + cast, + sizeof, +) +from typing import Any + +import hip.hip as hip +import hip.hiprtc as hiprtc + +lds_sizes = { + "gfx908": 64 * 1024, + "gfx90a": 64 * 1024, + "gfx940": 64 * 1024, + "gfx941": 64 * 1024, + "gfx942": 64 * 1024, + "gfx950": 64 * 1024, +} + +unsupported_data_types = { + "gfx908": [ + "MALL", + "MFMA-F4", + "MFMA-F6", + "MFMA-F8", + "MFMA-F16", + "MFMA-BF16", + "MFMA-F64", + "MFMA-I8", + ], # MI100 series + "gfx90a": ["MALL", "MFMA-F4", "MFMA-F6", "MFMA-F8"], # MI200 series + "gfx940": ["MFMA-F4", "MFMA-F6"], # MI300A_A0 + "gfx941": ["MFMA-F4", "MFMA-F6"], # MI300X_A0 + "gfx942": ["MFMA-F4", "MFMA-F6"], # MI300A_A1, MI300X_A1, MI308 + "gfx950": [], # MI350, MI355 +} + +cache_kernel_selector = { + "L1": { + "gfx908": "Cache_bw", + "gfx90a": "Cache_bw", + "gfx940": "Cache_bw", + "gfx941": "Cache_bw", + "gfx942": "Cache_bw", + "gfx950": "Cache_bw", + }, + "L2": { + "gfx908": "Cache_bw", + "gfx90a": "Cache_bw", + "gfx940": "Cache_bw", + "gfx941": "Cache_bw", + "gfx942": "Cache_bw", + "gfx950": "Cache_bw", + }, + "MALL": { + "gfx940": "Cache_bw", + "gfx941": "Cache_bw", + "gfx942": "Cache_bw", + "gfx950": "Cache_bw", + }, +} + +mfma_kernel_selector = { + "F4": "mfma_f8f6f4", + "F6": "mfma_f8f6f4", + "F8": "mfma_f8", + "F16": "mfma_f16", + "BF16": "mfma_bf16", + "F32": "mfma_f32", + "F64": "mfma_f64", + "I8": "mfma_i8", +} + +flops_kernel_selector = { + "FP16": ["flops_benchmark<__half, 1024>", sizeof(c_short)], + "FP32": ["flops_benchmark", sizeof(c_float)], + "FP64": ["flops_benchmark", sizeof(c_double)], + "INT8": ["flops_benchmark", sizeof(c_int8)], + "INT32": ["flops_benchmark", sizeof(c_int32)], + "INT64": ["flops_benchmark", sizeof(c_int64)], +} + +mfma_ops = { + "F4": {"gfx950": 131072}, + "F6": {"gfx950": 131072}, + "F8": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 32768), + "F16": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 16384), + "F32": dict.fromkeys( + ["gfx908", "gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 4096 + ), + "BF16": dict.fromkeys(["gfx940", "gfx941", "gfx942", "gfx950"], 16384) + | dict.fromkeys(["gfx90a"], 8192), + "I8": dict.fromkeys(["gfx940", "gfx941", "gfx942", "gfx950"], 32768) + | dict.fromkeys(["gfx90a"], 16384), + "F64": dict.fromkeys(["gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"], 2048), +} + +cache_sizes = { + "L1": { + "gfx908": 16 * 1024, + "gfx90a": 16 * 1024, + "gfx940": 32 * 1024, + "gfx941": 32 * 1024, + "gfx942": 32 * 1024, + "gfx950": 32 * 1024, + }, + "L2": { + "gfx908": 8 * 1024 * 1024, + "gfx90a": 8 * 1024 * 1024, + "gfx940": 4 * 1024 * 1024, + "gfx941": 4 * 1024 * 1024, + "gfx942": 4 * 1024 * 1024, + "gfx950": 4 * 1024 * 1024, + }, + "MALL": { + "gfx940": 64 * 1024 * 1024, + "gfx941": 64 * 1024 * 1024, + "gfx942": 64 * 1024 * 1024, + "gfx950": 64 * 1024 * 1024, + }, +} + + +Stats = namedtuple("Stats", ["mean", "stdev", "confidence"]) +PerfMetrics = namedtuple("PerfMetrics", ["mean", "low", "high"]) + +DEFAULT_WORKGROUP_SIZE = 256 +DEFAULT_WORKGROUPS = 8192 +DEFAULT_THREADS = DEFAULT_WORKGROUP_SIZE * DEFAULT_WORKGROUPS +DEFAULT_NUM_EXPERIMENTS = 100 +DEFAULT_NUM_ITERS = 10 +DEFAULT_DATASET_SIZE = 512 * 1024 * 1024 + + +def show_progress(pct: float) -> None: + + bar_char = "|" + bar_size = 60 + + count = int(bar_size * pct) + bar = "[" + bar_char * count + " " * (bar_size - count) + "]" + + print(f"\r{int(pct * 100):3d}% {bar}", end="", flush=True) + + +# Returns a named tuple with the mean, std deviation and confidence +def calc_stats(samples: list) -> Stats: + + mean = sum(samples) / len(samples) + + stdev = 0.0 + + for i in range(len(samples)): + stdev += math.pow(samples[i] - mean, 2) + + stdev = math.sqrt(stdev / len(samples)) + + return Stats(mean, stdev, 1.96 * stdev / math.sqrt(len(samples))) + + +# Helper class for loading and compiling kerels +class Program: + def __init__(self, src: str, templates: list[str] = []) -> None: + self.prog = hiprtc.hiprtcCreateProgram(src, "prog") + + for t in templates: + hiprtc.hiprtcAddNameExpression(self.prog, t) + try: + hiprtc.hiprtcCompileProgram(self.prog) + except hiprtc.HIPRTCError as e: + log = hiprtc.hiprtcGetProgramLog(self.prog) + print(f"Program log: {log}") + raise e + + self.code = hiprtc.hiprtcGetCode(self.prog) + self.module = hip.hipModuleLoadData(self.code) + + def get_kernel(self, kernel_name: str) -> POINTER: + + # TODO: Why doesn't hiprtcGetLoweredName work with non-template functions? + if "<" in kernel_name: + kernel_name = hiprtc.hiprtcGetLoweredName(self.prog, kernel_name) + + return hip.hipModuleGetFunction(self.module, kernel_name) + + +# Helper method for launching kernel +def launch_kernel( + func: POINTER, + grid_size: list[int], + block_size: list[int], + shared_mem_size: int, + stream: POINTER, + args: list[Any] = [], +) -> None: + + # Convert to native types + args_converted = [] + for arg in args: + if isinstance(arg, int): + args_converted.append(c_int(arg)) + elif isinstance(arg, hip.HIPDeviceMemory): + args_converted.append(arg.ptr) + else: + args_converted.append(arg) + + # Convert to void pointers + normalized = [cast(byref(arg), c_void_p) for arg in args_converted] + + args_ptr = (c_void_p * len(args))(*normalized) + + hip.hipModuleLaunchKernel( + func, + grid_size[0], + grid_size[1], + grid_size[2], + block_size[0], + block_size[1], + block_size[2], + shared_mem_size, + stream, + args_ptr, + ) + + +# Retrieve the gfx architecture +def get_gfx_arch(device: int) -> str: + + arch_str = hip.hipGetDeviceProperties(device).gcnArchName + + # Parse out only gfx + return arch_str.split(":", 1)[0] + + +# Helper method to run a kernel and collect samples +def run_get_samples( + count: int, + work_per_kernel: int, + func: POINTER, + grid_size: list[int], + block_size: list[int], + shared_mem_size: int, + stream: POINTER, + args: list[Any] = [], +) -> list[float]: + + event_start = hip.hipEventCreate() + event_stop = hip.hipEventCreate() + + samples = [] + for i in range(count): + hip.hipEventRecord(event_start) + launch_kernel( + func, + grid_size, + block_size, + shared_mem_size, + stream, + args, + ) + hip.hipEventRecord(event_stop) + hip.hipDeviceSynchronize() + show_progress(float(i + 1) / count) + event_ms = hip.hipEventElapsedTime(event_start, event_stop) + + samples.append(float(work_per_kernel) / event_ms / 1e6) + + print() + + return samples + + +cache_bw_src = """ +template +__global__ void Cache_bw(const T *memBlock, T *dummy, int numIter) +{ + const int thread_id = threadIdx.x; + constexpr int cache_count = cacheSize / sizeof(T); + + T sink; + + sink = 0; + for (int iter = 0; iter < numIter; ++iter) + { +#pragma unroll 32 + for (int i = 0; i < cache_count; i += workgroup_size) + { + // if the size of the memory block is small (e.g., the size + // of L1), then we need a slightly more complicated index + // calculation. Otherwise, the compiler holds all the loads + // in the inner loop in registers upon the first pass of the + // outer loop, and it doesn't do the loads upon subsequent + // passes of the outer loop. + // OTOH, if the size of the memory block is larger (such as L2 + // size), experimentation showed that the overhead of the more + // complicated index calculation has a noticeable effect on BW, + // so we use a simpler index expression instead. This works since + // for larger memory blocks, the compiler cannot hold the loads + // of the inner loop in registers anymore, as it can with L1-sized + // buffers. + if constexpr (cache_count / workgroup_size <= 32) + { + sink += memBlock[(thread_id + i + iter) % cache_count]; + } + else + { + sink += memBlock[thread_id + i]; + } + } + } + + dummy[thread_id] = sink; +} +""" + +hbm_bw_src = """ +template +__global__ void HBM_bw(T *dst, const T *src) +{ + const unsigned int gid = blockDim.x * blockIdx.x + threadIdx.x; + const unsigned int tid = threadIdx.x; + + dst[gid] = src[gid]; +} +""" + + +def hbm_bw_benchmark(device: int) -> PerfMetrics: + num_experiments = DEFAULT_NUM_EXPERIMENTS + hip.hipSetDevice(device) + + cus = hip.hipGetDeviceProperties(device).multiProcessorCount + + prog = Program(hbm_bw_src, ["HBM_bw"]) + func = prog.get_kernel("HBM_bw") + + workgroup_size = DEFAULT_WORKGROUP_SIZE + workgroups_per_cu = 20 * 1024 + workgroups = cus * workgroups_per_cu + dataset_entries = workgroups * workgroup_size + + d_src = hip.hipMalloc(dataset_entries * sizeof(c_double)) + d_dst = hip.hipMalloc(dataset_entries * sizeof(c_double)) + + total_bytes = dataset_entries * sizeof(c_double) * 2 + + launch_kernel( + func, [workgroups, 1, 1], [workgroup_size, 1, 1], 0, None, [d_dst, d_src] + ) + hip.hipDeviceSynchronize() + + samples = run_get_samples( + num_experiments, + total_bytes, + func, + [workgroups, 1, 1], + [workgroup_size, 1, 1], + 0, + None, + [d_dst, d_src], + ) + + stats = calc_stats(samples) + + mean = stats.mean + stdev = stats.stdev + + perf_metrics = PerfMetrics(mean, mean - stats.confidence, mean + stats.confidence) + + event_ms = total_bytes / mean / 1e6 + print( + f"HBM BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:.1f} GB/sec" + ) + + return perf_metrics + + +def cache_bw_bench(device: int, type: str, iters: int) -> PerfMetrics: + hip.hipSetDevice(device) + + num_experiments = DEFAULT_NUM_EXPERIMENTS + workgroup_size = DEFAULT_WORKGROUP_SIZE + + cus = hip.hipGetDeviceProperties(device).multiProcessorCount + + arch = get_gfx_arch(device) + cache_size = cache_sizes[type][arch] + + mem_block = hip.hipMalloc(cache_size) + dummy = hip.hipMalloc(workgroup_size * sizeof(c_float)) + + kernel_name = cache_kernel_selector[type][arch] + prog = Program(cache_bw_src, [kernel_name]) + func = prog.get_kernel(kernel_name) + + workgroups = 128 * cus + total_bytes = workgroups * iters * cache_size + + launch_kernel( + func, + [workgroups, 1, 1], + [workgroup_size, 1, 1], + 0, + None, + [mem_block, dummy, iters], + ) + hip.hipDeviceSynchronize() + + samples = run_get_samples( + num_experiments, + total_bytes, + func, + [workgroups, 1, 1], + [workgroup_size, 1, 1], + 0, + None, + [mem_block, dummy, iters], + ) + + stats = calc_stats(samples) + mean = stats.mean + stdev = stats.stdev + + perf_metrics = PerfMetrics(mean, mean - stats.confidence, mean + stats.confidence) + + event_ms = total_bytes / mean / 1e6 + + print( + f"{type} BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:1f} GB/sec" + ) + + return perf_metrics + + +def mall_bw_bench(device: int) -> PerfMetrics: + return cache_bw_bench(device, "MALL", 1) + + +def l1_bw_bench(device: int) -> PerfMetrics: + return cache_bw_bench(device, "L1", 100) + + +def l2_bw_bench(device: int) -> PerfMetrics: + return cache_bw_bench(device, "L2", 10) + + +lds_benchmark_src = """ +extern "C" __global__ void LDS_bw(int numIter, float *dummy) +{ + const int tid = threadIdx.x; + __shared__ unsigned char shmem[64]; + + + if (tid == 0) + { + #pragma unroll + for (int i=0;i<63;i++) + shmem[i] = i+1; + + shmem[63] = 0; + } + + __syncthreads(); + + int index = tid; + #pragma unroll 64 + for(int iter = 0; iter < numIter; iter++) + index = shmem[index]; + + dummy[tid] = (float )index; +} + +""" + + +def lds_bw_benchmark(device: int) -> PerfMetrics: + + num_experiments = DEFAULT_NUM_EXPERIMENTS + workgroup_size = DEFAULT_WORKGROUP_SIZE + + cus = hip.hipGetDeviceProperties(device).multiProcessorCount + + iters = 2000 + + workgroups = 128 * cus + total_bytes = workgroups * workgroup_size * iters * sizeof(c_float) + + dummy = hip.hipMalloc(workgroup_size * sizeof(c_float)) + + prog = Program(lds_benchmark_src) + func = prog.get_kernel("LDS_bw") + + # Warmup + launch_kernel( + func, [workgroups, 1, 1], [workgroup_size, 1, 1], 0, None, [iters, dummy] + ) + hip.hipDeviceSynchronize() + + samples = run_get_samples( + num_experiments, + total_bytes, + func, + [workgroups, 1, 1], + [workgroup_size, 1, 1], + 0, + None, + [iters, dummy], + ) + + stats = calc_stats(samples) + mean = stats.mean + stdev = stats.stdev + + perf_metrics = PerfMetrics(mean, mean - stats.confidence, mean + stats.confidence) + + event_ms = total_bytes / mean / 1e6 + + print( + f"LDS BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:1f} GB/sec" + ) + + return perf_metrics + + +flops_benchmark_src = """ +template +__global__ void flops_benchmark(T *buf, int nSize) +{ + const int gid = blockDim.x * blockIdx.x + threadIdx.x; + const int nThreads = gridDim.x * blockDim.x; + const int nEntriesPerThread = (int) nSize / nThreads; + const int maxOffset = nEntriesPerThread * nThreads; + + T *ptr; + const T y = (T) 1.1; + + ptr = &buf[gid]; + T x = (T) 2.0; + + for(int offset=0; offset < maxOffset; offset += nThreads) + { + for(int j=0; j PerfMetrics: + num_experiments = DEFAULT_NUM_EXPERIMENTS + workgroup_size = DEFAULT_WORKGROUP_SIZE + dataset_size = DEFAULT_DATASET_SIZE + cus = hip.hipGetDeviceProperties(device).multiProcessorCount + + memblock = hip.hipMalloc(dataset_size) + workgroups = 128 * cus + threads = workgroups * workgroup_size + + kernel_name = flops_kernel_selector[type][0] + type_size = flops_kernel_selector[type][1] + + n_size = dataset_size // type_size // threads * threads + + total_flops = n_size * 1024 * 2 + + prog = Program(flops_benchmark_src, [kernel_name]) + + func = prog.get_kernel(kernel_name) + + # Warmup + launch_kernel( + func, [workgroups, 1, 1], [workgroup_size, 1, 1], 0, None, [memblock, n_size] + ) + hip.hipDeviceSynchronize() + + samples = run_get_samples( + num_experiments, + total_flops, + func, + [workgroups, 1, 1], + [workgroup_size, 1, 1], + 0, + None, + [memblock, n_size], + ) + + stats = calc_stats(samples) + mean = stats.mean + stdev = stats.stdev + + perf_metrics = PerfMetrics(mean, mean - stats.confidence, mean + stats.confidence) + + event_ms = total_flops / mean / 1e6 + + print( + f"Peak {unit}s ({type}), GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, {unit}:{total_flops}, duration:{event_ms:.1f} ms, mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS" + ) + + return perf_metrics + + +mfma_f32_src = """ +using f32_16vec = __attribute__((__vector_size__(16 * sizeof(float)))) float; + +extern "C" __global__ void mfma_f32(int iter, float *dummy) +{ + // Input: 1 F32 register + float a = threadIdx.x; + + // Output: 16 F32 registers + f32_16vec result = {0}; + + // CDNA2: v_mfma_f32_32x32x2f32 ops: 32x32x2x2 = 4096 + // CDNA3: v_mfma_f32_32x32x2_f32 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_f32_32x32x2f32(a, a, result, 0, 0, 0); + } + + if (result[0] != 2*result[0]) + { + dummy[0] = result[0]; + } +} +""" + +mfma_f16_src = """ + +using f32_16vec = __attribute__((__vector_size__(16 * sizeof(float)))) float; +using f16_2vec = __attribute__((__vector_size__(2 * sizeof(__2f16)))) float; + +extern "C" __global__ void mfma_f16(int iter, float *dummy) +{ + // Input: 2 F32 registers + f16_2vec a; + a[1] = a[0] = threadIdx.x; + + //Output: 16 F32 registers + f32_16vec result = {0}; + + // CDNA2: v_mfma_f32_32x32x8f16 ops: 32x32x8x2 = 16384 + // CDNA3: v_mfma_f32_32x32x8_f16 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_f32_32x32x8f16(a, a, result, 0, 0, 0); + } + + if (result[0] != 2*result[0]) + { + dummy[0] = result[0]; + } +} +""" + +mfma_bf16_src = """ + +using f32_16vec = __attribute__((__vector_size__(16 * sizeof(float)))) float; +using bf16_4vec = __attribute__((__vector_size__(2 * sizeof(__2i16)))) short; +using bf16_2vec = __attribute__((__vector_size__(1 * sizeof(__2i16)))) short; + +extern "C" __global__ void mfma_bf16(int iter, float *dummy) +{ + // Output: 16 F32 registers + f32_16vec result = {0}; + +// MI100/MI200 +#if defined(__gfx908__) or defined(__gfx90a__) + // Input: 1 F32 register + // builtin mfma expects 2 short registers + bf16_2vec a; + a[1] = a[0]= threadIdx.x; + + // CDNA1/2: v_mfma_f32_32x32x4bf16 ops: 32x32x4x2 = 8192 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_f32_32x32x4bf16(a, a, result, 0, 0, 0); + } +//MI300 series +#else + // Input: 2 F32 registers + // builting mfma expects 4 short registers + bf16_4vec a; + a[3] = a[2] = a[1] = a[0]= threadIdx.x; + + // CDNA3: v_mfma_f32_32x32x8_bf16 ops: 32x32x8x2 = 16384 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_f32_32x32x8bf16_1k(a, a, result, 0, 0, 0); + } +#endif + + if (result[0] != 2*result[0]) + { + dummy[0] = result[0]; + } +} +""" + +mfma_f64_src = """ + +using f64_4vec = __attribute__((__vector_size__(4 * sizeof(double)))) double; + +extern "C" __global__ void mfma_f64(int iter, float *dummy) +{ + // MI200 and above + // Input: 1 F64 register + double a = threadIdx.x; + + // Output: 4 F64 registers + f64_4vec result = {0}; + + // CDNA2: v_mfma_f64_16x16x4f64 ops: 16x16x4x2 = 2048 + // CDNA3: v_mfma_f64_16x16x4_f64 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_f64_16x16x4f64(a, a, result, 0, 0, 0); + } + + if (result[0] != 2*result[0]) + { + dummy[0] = result[0]; + } +} +""" + +mfma_i8_src = """ +using int32_8vec = __attribute__((__vector_size__(8 * sizeof(int)))) int; +using int32_16vec = __attribute__((__vector_size__(16 * sizeof(int)))) int; + +extern "C" __global__ void mfma_i8(int iter, float *dummy) +{ + // Output: 16 I32 registers + int32_16vec result = {0}; + +// MI100/MI200 +#if defined(__gfx908__) or defined(__gfx90a__) + // Input: 1 I32 register + int a = threadIdx.x; + + // CDNA1/2: v_mfma_i32_32x32x8i8 ops: 32x32x8x2 = 16384 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_i32_32x32x8i8(a, a, result, 0, 0, 0); + } +// MI300 series +#else + // Input: 2 I32 registers + // builting mfma expects I64 input + long a = threadIdx.x; + + // CDNA3: v_mfma_i32_32x32x16_i8 ops: 32x32x16x2 = 32768 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, a, result, 0, 0, 0); + } +#endif + + if (result[0] != 2*result[0]) + { + dummy[0] = result[0]; + } +} +""" + +mfma_f8_src = """ + +using f32_16vec = __attribute__((__vector_size__(16 * sizeof(float)))) float; + +extern "C" __global__ void mfma_f8(int iter, float *dummy) +{ + // MI300 series only - note gfx940/gfx941/gfx942 only uses fnuz f8 + // Input: 2 F32 registers + // builtin mfma expects double input + double a = threadIdx.x; + + // Output: 16 F32 registers + f32_16vec result = {0}; + + // CDNA3: v_mfma_f32_32x32x16_fp8_fp8 ops: 32x32x16x2 = 32768 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, a, result, 0, 0, 0); + } + + if (result[0] != 2*result[0]) + { + dummy[0] = result[0]; + } +} +""" + +mfma_f8f6f4_src = """ + +using int32_16vec = __attribute__((__vector_size__(16 * sizeof(int)))) int; +using int32_8vec = __attribute__((__vector_size__(8 * sizeof(int)))) int; +using bf16_2vec = __attribute__((__vector_size__(1 * sizeof(__2i16)))) short; +using bf16_4vec = __attribute__((__vector_size__(2 * sizeof(__2i16)))) short; +using f32_16vec = __attribute__((__vector_size__(16 * sizeof(float)))) float; +using f16_2vec = __attribute__((__vector_size__(2 * sizeof(__2f16)))) float; + +#define FP8_E4M3 0 +#define BF8_E5M2 1 +#define FP6_E2M3 2 +#define BF6_E3M2 3 +#define FP4_E2M1 4 + +template __global__ void mfma_f8f6f4(int iter, float *dummy) +{ + // MI350 series only + // Input: 8 i32 registers + int32_8vec a; + a[0] = a[1] = a[2] = a[3] = a[4] = a[5] = a[6] = a[7] = threadIdx.x; + + // Output: 16 F32 registers + f32_16vec result = {0}; + + // CDNA4: v_mfma_f32_32x32x64_f8f6f4 ops: 32x32x64x2 = 131072 + switch (datatype) + { + case FP8_E4M3: // fp8 x fp8 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 0, 0, 0, 0, 0, 0); + } + case BF8_E5M2: // bf8 x bf8 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 1, 1, 0, 0, 0, 0); + } + break; + case FP6_E2M3: // fp6 x fp6 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 2, 2, 0, 0, 0, 0); + } + break; + case BF6_E3M2: // bf6 x bf6 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 3, 3, 0, 0, 0, 0); + } + break; + case FP4_E2M1: // fp4 x fp4 + for(int i = 0; i < iter; ++i) + { + result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 4, 4, 0, 0, 0, 0); + } + break; + } + + if (result[0] != 2*result[0]) + { + dummy[0] = result[0]; + } +} + +""" + + +def mfma_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics: + SIMDS_PER_CU = 4 + experiments = DEFAULT_NUM_EXPERIMENTS + iters = 2000 + + cus = hip.hipGetDeviceProperties(device).multiProcessorCount + + workgroups = 128 * cus + workgroup_size = DEFAULT_WORKGROUP_SIZE + + arch = get_gfx_arch(device) + total_flops = workgroups * SIMDS_PER_CU * iters * mfma_ops[type][arch] + + dummy = hip.hipMalloc(64 * sizeof(c_float)) + + kernel_name = mfma_kernel_selector[type] + + if type == "F32": + src = mfma_f32_src + elif type == "F8": + src = mfma_f8_src + elif type == "F16": + src = mfma_f16_src + elif type == "BF16": + src = mfma_bf16_src + elif type == "F64": + src = mfma_f64_src + elif type == "I8": + src = mfma_i8_src + else: + src = mfma_f8f6f4_src + + prog = Program(src, [kernel_name]) + func = prog.get_kernel(kernel_name) + + samples = run_get_samples( + experiments, + total_flops, + func, + [workgroups, 1, 1], + [workgroup_size, 1, 1], + 0, + None, + [iters, dummy], + ) + + stats = calc_stats(samples) + mean = stats.mean + stdev = stats.stdev + + perf_metrics = PerfMetrics(mean, mean - stats.confidence, mean + stats.confidence) + + event_ms = total_flops / mean / 1e6 + + print( + f"Peak MFMA {unit}s ({type}), GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{experiments}, {unit}:{total_flops}, duration:{event_ms:.2f} ms, mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS" + ) + + return perf_metrics + + +def mfma_f32_bench(device: int) -> PerfMetrics: + + return mfma_bench(device, "F32", "FLOP", "GFLOPS") + + +def mfma_f16_bench(device: int) -> PerfMetrics: + + return mfma_bench(device, "F16", "FLOP", "GFLOPS") + + +def mfma_bf16_bench(device: int) -> PerfMetrics: + + return mfma_bench(device, "BF16", "FLOP", "GFLOPS") + + +def mfma_f64_bench(device: int) -> PerfMetrics: + + return mfma_bench(device, "F64", "FLOP", "GFLOPS") + + +def mfma_f8_bench(device: int) -> PerfMetrics: + + return mfma_bench(device, "F8", "FLOP", "GFLOPS") + + +def mfma_i8_bench(device: int) -> PerfMetrics: + + return mfma_bench(device, "I8", "IOP", "GOPS") + + +def mfma_f4_bench(device: int) -> PerfMetrics: + + return mfma_bench(device, "F4", "FLOP", "GFLOPS") + + +def mfma_f6_bench(device: int) -> PerfMetrics: + + return mfma_bench(device, "F6", "FLOP", "GFLOPS") + + +def fp16_benchmark(device: int) -> PerfMetrics: + + return flops_bench(device, "FP16", "FLOP", "GFLOPS") + + +def fp32_benchmark(device: int) -> PerfMetrics: + + return flops_bench(device, "FP32", "FLOP", "GFLOPS") + + +def fp64_benchmark(device: int) -> PerfMetrics: + + return flops_bench(device, "FP64", "FLOP", "GFLOPS") + + +def int8_benchmark(device: int) -> PerfMetrics: + + return flops_bench(device, "INT8", "IOP", "GOPS") + + +def int32_benchmark(device: int) -> PerfMetrics: + + return flops_bench(device, "INT32", "IOP", "GOPS") + + +def int64_benchmark(device: int) -> PerfMetrics: + + return flops_bench(device, "INT64", "IOP", "GOPS") + + +tests = { + "HBM": hbm_bw_benchmark, + "MALL": mall_bw_bench, + "L2": l2_bw_bench, + "L1": l1_bw_bench, + "LDS": lds_bw_benchmark, + "F16": fp16_benchmark, + "F32": fp32_benchmark, + "F64": fp64_benchmark, + "I8": int8_benchmark, + "I32": int32_benchmark, + "I64": int64_benchmark, + "MFMA-F4": mfma_f4_bench, + "MFMA-F6": mfma_f6_bench, + "MFMA-F8": mfma_f8_bench, + "MFMA-F16": mfma_f16_bench, + "MFMA-BF16": mfma_bf16_bench, + "MFMA-F32": mfma_f32_bench, + "MFMA-F64": mfma_f64_bench, + "MFMA-I8": mfma_i8_bench, +} + + +# Run the roofine tests on the specified device +def run_benchmark(device: int) -> dict[PerfMetrics]: + + metrics_dict = {} + + arch = get_gfx_arch(device) + cus = hip.hipGetDeviceProperties(device).multiProcessorCount + + print(f"GPU Device {device} ({arch}) with {cus} CUs: Profiling...") + + for name, func in tests.items(): + if arch in unsupported_data_types and name in unsupported_data_types[arch]: + print(f"Skipping {name}") + metrics = PerfMetrics(0, 0, 0) + else: + metrics = func(device) + + metrics_dict[name] = metrics + + return metrics_dict + + +# Run the benchmark test on the specified devices +# Returns a dictionary mapping device ID to dictionary of +# metrics +def run_on_devices(devices: list[int]) -> dict[dict[PerfMetrics]]: + + metrics = {} + for d in devices: + metrics[d] = run_benchmark(d) + + return metrics + + +def dump_csv(metrics: dict[dict[PerfMetrics]], file_path: str) -> None: + # TODO: Better way to map CSV column names? + csv_cols_map = { + "HBM": "HBMBw", + "MALL": "MALLBw", + "L2": "L2Bw", + "L1": "L1Bw", + "LDS": "LDSBw", + "F16": "FP16Flops", + "F32": "FP32Flops", + "F64": "FP64Flops", + "I8": "I8Ops", + "I32": "I32Ops", + "I64": "I64Ops", + "MFMA-F4": "MFMAF4Flops", + "MFMA-F6": "MFMAF6Flops", + "MFMA-F8": "MFMAF8Flops", + "MFMA-F16": "MFMAF16Flops", + "MFMA-BF16": "MFMABF16Flops", + "MFMA-F32": "MFMAF32Flops", + "MFMA-F64": "MFMAF64Flops", + "MFMA-I8": "MFMAI8Ops", + } + + with open(file_path, "w") as f: + writer = csv.writer(f) + + types = csv_cols_map.keys() + + # Write the first row (col names) + row = ["device"] + for t in types: + row.append(csv_cols_map[t]) + row.append(csv_cols_map[t] + "Low") + row.append(csv_cols_map[t] + "High") + + writer.writerow(row) + + for d in metrics: + row = [d] + for t in types: + row.append(metrics[d][t].mean) + row.append(metrics[d][t].low) + row.append(metrics[d][t].high) + + writer.writerow(row) + + +if __name__ == "__main__": + import sys + + device_id = 0 + + if len(sys.argv) >= 3: + if sys.argv[1] == "-d": + device_id = int(sys.argv[2]) + + metrics = run_on_devices([device_id]) + dump_csv(metrics, "roofline.csv") diff --git a/projects/rocprofiler-compute/src/utils/rooflines/roofline-azurelinux3-rocm7 b/projects/rocprofiler-compute/src/utils/rooflines/roofline-azurelinux3-rocm7 deleted file mode 100755 index 4e38a84874..0000000000 Binary files a/projects/rocprofiler-compute/src/utils/rooflines/roofline-azurelinux3-rocm7 and /dev/null differ diff --git a/projects/rocprofiler-compute/src/utils/rooflines/roofline-rhel8-rocm6 b/projects/rocprofiler-compute/src/utils/rooflines/roofline-rhel8-rocm6 deleted file mode 100755 index 4b333d7f9c..0000000000 Binary files a/projects/rocprofiler-compute/src/utils/rooflines/roofline-rhel8-rocm6 and /dev/null differ diff --git a/projects/rocprofiler-compute/src/utils/rooflines/roofline-rhel8-rocm7 b/projects/rocprofiler-compute/src/utils/rooflines/roofline-rhel8-rocm7 deleted file mode 100755 index 57cc8ae1ce..0000000000 Binary files a/projects/rocprofiler-compute/src/utils/rooflines/roofline-rhel8-rocm7 and /dev/null differ diff --git a/projects/rocprofiler-compute/src/utils/rooflines/roofline-sles15sp6-rocm6 b/projects/rocprofiler-compute/src/utils/rooflines/roofline-sles15sp6-rocm6 deleted file mode 100755 index 5a31bcb631..0000000000 Binary files a/projects/rocprofiler-compute/src/utils/rooflines/roofline-sles15sp6-rocm6 and /dev/null differ diff --git a/projects/rocprofiler-compute/src/utils/rooflines/roofline-sles15sp6-rocm7 b/projects/rocprofiler-compute/src/utils/rooflines/roofline-sles15sp6-rocm7 deleted file mode 100755 index 895207bdda..0000000000 Binary files a/projects/rocprofiler-compute/src/utils/rooflines/roofline-sles15sp6-rocm7 and /dev/null differ diff --git a/projects/rocprofiler-compute/src/utils/rooflines/roofline-ubuntu22_04-rocm6 b/projects/rocprofiler-compute/src/utils/rooflines/roofline-ubuntu22_04-rocm6 deleted file mode 100755 index e4c5ff8413..0000000000 Binary files a/projects/rocprofiler-compute/src/utils/rooflines/roofline-ubuntu22_04-rocm6 and /dev/null differ diff --git a/projects/rocprofiler-compute/src/utils/rooflines/roofline-ubuntu22_04-rocm7 b/projects/rocprofiler-compute/src/utils/rooflines/roofline-ubuntu22_04-rocm7 deleted file mode 100755 index 768940c737..0000000000 Binary files a/projects/rocprofiler-compute/src/utils/rooflines/roofline-ubuntu22_04-rocm7 and /dev/null differ diff --git a/projects/rocprofiler-compute/src/utils/utils.py b/projects/rocprofiler-compute/src/utils/utils.py index f299d825b7..2f60506002 100644 --- a/projects/rocprofiler-compute/src/utils/utils.py +++ b/projects/rocprofiler-compute/src/utils/utils.py @@ -1267,120 +1267,6 @@ def gen_sysinfo( df.to_csv(workload_dir + "/" + "sysinfo.csv", index=False) -def detect_roofline(mspec: Any) -> dict[str, str]: # noqa: ANN401 - from utils import specs - - rocm_ver = int(mspec.rocm_version[:1]) - - target_binary: dict[str, Any] = { - "rocm_ver": rocm_ver, - "distro": "override", - "path": None, - } - - # Create distro ID list based off of ID (a string, containing a single distro) - # and ID_LIKE (a string, listing at least one distro, separated by a single space) - # from the system /etc/os-release file - os_release = Path("/etc/os-release").read_text() - id_list = specs.search(r'^ID_LIKE="?(.*?)"?$', os_release) or "" - id = specs.search(r'^ID="?(.*?)"?$', os_release) or "" - id_list = id_list.split() + [id] - - if "ROOFLINE_BIN" in os.environ.keys(): - rooflineBinary = os.environ["ROOFLINE_BIN"] - if Path(rooflineBinary).exists(): - console_warning( - "roofline", - f"Detected user-supplied binary --> ROOFLINE_BIN = {rooflineBinary}\n", - ) - # distro stays marked as override and path value is substituted in - target_binary["path"] = rooflineBinary - return target_binary - else: - console_error( - "roofline", - "user-supplied path to binary not accessible --> " - f"ROOFLINE_BIN = {rooflineBinary}\n", - ) - - # check that the system OS is based off of one of the following distributions - elif "azurelinux" in id_list: - distro = "azurelinux" - - elif "debian" in id_list: - distro = "22.04" - - elif ("fedora" in id_list) or ("tencentos" in id_list): - distro = "platform:el8" - - elif "suse" in id_list: - distro = "15.6" - - else: - console_error( - "roofline", "Cannot find a valid binary for your operating system" - ) - - # distro gets assigned, to follow default roofline bin location and nomenclature - target_binary["distro"] = distro - return target_binary - - -def mibench(args: argparse.Namespace, mspec: Any) -> None: # noqa: ANN401 - """Run roofline microbenchmark to generate peek BW and FLOP measurements.""" - console_log("roofline", "No roofline data found. Generating...") - - distro_map = { - "platform:el8": "rhel8", - "15.6": "sles15sp6", - "22.04": "ubuntu22_04", - "azurelinux": "azurelinux3", - } - - binary_paths: list[str] = [] - - target_binary = detect_roofline(mspec) - if target_binary["distro"] == "override": - binary_paths.append(target_binary["path"]) - else: - # check two potential locations for roofline binaries due to differences in - # development usage vs formal install - potential_paths = [ - config.rocprof_compute_home / "utils" / "rooflines" / "roofline", - config.rocprof_compute_home.parent.parent / "bin" / "roofline", - ] - - for directory in potential_paths: - path_to_binary = ( - f"{directory}-{distro_map[target_binary['distro']]}" - f"-rocm{target_binary['rocm_ver']}" - ) - binary_paths.append(path_to_binary) - - # Distro is valid but cant find rocm ver - found = False - for binary_path in binary_paths: - if Path(binary_path).exists(): - found = True - path_to_binary = binary_path - break - - if not found: - console_error("roofline", f"Unable to locate expected binary ({binary_paths}).") - - my_args = [ - path_to_binary, - "-o", - f"{args.path}/roofline.csv", - "-d", - str(args.device), - ] - if args.quiet: - my_args += "--quiet" - - subprocess.run(my_args, check=True) - - def get_submodules(package_name: str) -> list[str]: """List all submodules for a target package""" import importlib diff --git a/projects/rocprofiler-compute/tests/test_utils.py b/projects/rocprofiler-compute/tests/test_utils.py index db06ac99ad..9b82490d25 100644 --- a/projects/rocprofiler-compute/tests/test_utils.py +++ b/projects/rocprofiler-compute/tests/test_utils.py @@ -32,10 +32,8 @@ import logging import os import re import shutil -import subprocess import tempfile from pathlib import Path -from types import SimpleNamespace from unittest import mock import pandas as pd @@ -4429,877 +4427,6 @@ def test_process_hip_trace_output_invalid_fbase_characters(tmp_path, monkeypatch utils_mod.process_hip_trace_output(workload_dir, fbase) -# ============================================================================== -# ROOFLINE DETECTION TESTS -# ============================================================================== - - -def test_ubuntu_detection(monkeypatch): - """ - Test Ubuntu detection. - - Args: - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching - - Returns: - Verifies that the function correctly identifies Ubuntu and - returns the appropriate distro - """ - mock_os_release = "ID=ubuntu\nID_LIKE=debian" - - def mock_path_read_text(self): - return mock_os_release - - monkeypatch.setattr("os.environ", {"keys": lambda: []}) - - monkeypatch.setattr("pathlib.Path.read_text", mock_path_read_text) - - def mock_search(pattern, text): - if "ID_LIKE" in pattern: - return "debian" - return None - - monkeypatch.setattr("utils.specs.search", mock_search) - - import utils.utils as utils_mod - - # Create an object with attribute value = 1 - result = utils_mod.detect_roofline(SimpleNamespace(rocm_version="0.x.x")) - - assert result["rocm_ver"] == 0 - - -def test_debian_detection(monkeypatch): - """ - Test Debian detection. - - Args: - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching - - Returns: - Verifies that the function correctly identifies Debian - and returns the appropriate distro - """ - mock_os_release = "ID=debian" - - def mock_path_read_text(self): - return mock_os_release - - monkeypatch.setattr("os.environ", {"keys": lambda: []}) - - monkeypatch.setattr("pathlib.Path.read_text", mock_path_read_text) - - def mock_search(pattern, text): - if "ID" in pattern: - return "debian" - return None - - monkeypatch.setattr("utils.specs.search", mock_search) - - import utils.utils as utils_mod - - # Create an object with attribute value = 1 - result = utils_mod.detect_roofline(SimpleNamespace(rocm_version="0.x.x")) - - assert result["rocm_ver"] == 0 - - -def test_rhel_detection(monkeypatch): - """ - Test RHEL distro detection. - - Args: - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching - - Returns: - Verifies that the function correctly identifies RHEL - and returns the appropriate distro - """ - mock_os_release = 'ID_LIKE="rhel fedora"\nID="rhel"' - - def mock_path_read_text(self): - return mock_os_release - - monkeypatch.setattr("os.environ", {"keys": lambda: []}) - - monkeypatch.setattr("pathlib.Path.read_text", mock_path_read_text) - monkeypatch.setattr("pathlib.Path.exists", lambda *a, **k: True) - - def mock_search(pattern, text): - if "ID_LIKE" in pattern: - return "rhel fedora" - return None - - monkeypatch.setattr("utils.specs.search", mock_search) - - import utils.utils as utils_mod - - result = utils_mod.detect_roofline(SimpleNamespace(rocm_version="7.x.x")) - - assert result["rocm_ver"] == 7 - - -def test_azl_detection(monkeypatch): - """ - Test Azure Linux distro detection. - - Args: - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching - - Returns: - Verifies that the function correctly identifies AZL - and returns the appropriate distro - """ - mock_os_release = "ID=azurelinux" - - def mock_path_read_text(self): - return mock_os_release - - monkeypatch.setattr("os.environ", {"keys": lambda: []}) - - monkeypatch.setattr("pathlib.Path.read_text", mock_path_read_text) - monkeypatch.setattr("pathlib.Path.exists", lambda *a, **k: True) - - def mock_search(pattern, text): - if "ID" in pattern: - return "azurelinux" - return None - - monkeypatch.setattr("utils.specs.search", mock_search) - - import utils.utils as utils_mod - - result = utils_mod.detect_roofline(SimpleNamespace(rocm_version="7.x.x")) - - assert result["rocm_ver"] == 7 - - -def test_sles_detection(monkeypatch): - """ - Test SLES detection. - - Args: - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching - - Returns: - Verifies that the function correctly identifies SLES - and returns the appropriate distro - """ - mock_os_release = 'ID="opensuse-leap"\nID_LIKE="suse opensuse"' - - def mock_path_read_text(self): - return mock_os_release - - monkeypatch.setattr("os.environ", {"keys": lambda: []}) - - monkeypatch.setattr("pathlib.Path.read_text", mock_path_read_text) - - def mock_search(pattern, text): - if "ID_LIKE" in pattern: - return "suse openuse" - return None - - monkeypatch.setattr("utils.specs.search", mock_search) - - import utils.utils as utils_mod - - result = utils_mod.detect_roofline(SimpleNamespace(rocm_version="0.x.x")) - - assert result["rocm_ver"] == 0 - - -# ============================================================================= -# TESTS FOR MIBENCH OUTPUT -# ============================================================================= - - -def test_mibench_override_distro_success(tmp_path, monkeypatch): - """ - Test mibench with override distro that successfully finds and executes binary. - - Args: - tmp_path (Path): Temporary directory for test files. - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching. - - Returns: - None: Asserts that override path is used and subprocess is called correctly. - """ - - class MockArgs: - path = str(tmp_path) - device = 0 - quiet = False - - class MockMspec: - pass - - override_binary_path = tmp_path / "custom_roofline" - override_binary_path.write_text("#!/bin/bash\necho 'roofline executed'") - override_binary_path.chmod(0o755) - - def mock_detect_roofline(mspec): - return { - "distro": "override", - "path": str(override_binary_path), - "rocm_ver": "0.x.x", - } - - subprocess_calls = [] - - def mock_subprocess_run(args, check=True): - subprocess_calls.append((args, check)) - - monkeypatch.setattr("utils.utils.detect_roofline", mock_detect_roofline) - monkeypatch.setattr("subprocess.run", mock_subprocess_run) - monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None) - - import utils.utils as utils_mod - - utils_mod.mibench(MockArgs(), SimpleNamespace(rocm_version="0.x.x")) - - assert len(subprocess_calls) == 1 - expected_args = [ # noqa - str(override_binary_path), - "-o", - str(tmp_path) + "/roofline.csv", - "-d", - "0", - ] - assert subprocess_calls[0][1] is True - - -def test_mibench_standard_distro_first_path_exists(tmp_path, monkeypatch): - """ - Test mibench with standard distro where first potential path exists. - - Args: - tmp_path (Path): Temporary directory for test files. - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching. - - Returns: - None: Asserts that first path is used when it exists. - """ - - class MockArgs: - path = str(tmp_path) - device = 1 - quiet = True - - class MockMspec: - pass - - rocprof_home = tmp_path / "rocprof_home" - install_root = tmp_path / "install_root" - rocprof_home.mkdir(parents=True) - install_root.mkdir(parents=True) - - first_path = rocprof_home / "utils" / "rooflines" - first_path.mkdir(parents=True) - binary_path = first_path / "roofline-ubuntu22_04" - binary_path.write_text("#!/bin/bash\necho 'roofline executed'") - binary_path.chmod(0o755) - - class MockConfig: - def __init__(self): - self.rocprof_compute_home = self.MockPath(rocprof_home, install_root) - - class MockPath: - def __init__(self, home_path, install_path): - self._home_path = home_path - self._install_path = install_path - self.parent = self.MockParent(install_path) - - def __str__(self): - return str(self._home_path) - - def __truediv__(self, other): - return self._home_path / other - - class MockParent: - def __init__(self, install_path): - self.parent = install_path - - def __truediv__(self, other): - return self.parent / other - - mock_config = MockConfig() - - def mock_detect_roofline(mspec): - return {"distro": "22.04", "rocm_ver": "0.x.x"} - - subprocess_calls = [] - - def mock_subprocess_run(args, check=True): - subprocess_calls.append((args, check)) - - monkeypatch.setattr("utils.utils.detect_roofline", mock_detect_roofline) - monkeypatch.setattr("utils.utils.config", mock_config) - monkeypatch.setattr("subprocess.run", mock_subprocess_run) - monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None) - monkeypatch.setattr("pathlib.Path.exists", lambda *a, **k: True) - - import utils.utils as utils_mod - - utils_mod.mibench(MockArgs(), SimpleNamespace(rocm_version="0.x.x")) - - assert len(subprocess_calls) == 1 - - -def test_mibench_standard_distro_second_path_exists(tmp_path, monkeypatch): - """ - Test mibench with standard distro where second potential path exists. - - Args: - tmp_path (Path): Temporary directory for test files. - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching. - - Returns: - None: Asserts that second path is used when first doesn't exist. - """ - - class MockArgs: - path = str(tmp_path) - device = 2 - quiet = False - - class MockMspec: - pass - - rocprof_home = tmp_path / "rocprof_home" - install_root = tmp_path / "install_root" - rocprof_home.mkdir(parents=True) - install_root.mkdir(parents=True) - - second_path = install_root / "bin" - second_path.mkdir(parents=True) - binary_path = second_path / "roofline-rhel8" - binary_path.write_text("#!/bin/bash\necho 'roofline executed'") - binary_path.chmod(0o755) - - class MockConfig: - def __init__(self): - self.rocprof_compute_home = self.MockPath(rocprof_home, install_root) - - class MockPath: - def __init__(self, home_path, install_path): - self._home_path = home_path - self._install_path = install_path - self.parent = self.MockParent(install_path) - - def __str__(self): - return str(self._home_path) - - def __truediv__(self, other): - return self._home_path / other - - class MockParent: - def __init__(self, install_path): - self.parent = install_path - - def __truediv__(self, other): - return self.parent / other - - mock_config = MockConfig() - - def mock_detect_roofline(mspec): - return {"distro": "platform:el8", "rocm_ver": "0.x.x"} - - subprocess_calls = [] - - def mock_subprocess_run(args, check=True): - subprocess_calls.append((args, check)) - - monkeypatch.setattr("utils.utils.detect_roofline", mock_detect_roofline) - monkeypatch.setattr("utils.utils.config", mock_config) - monkeypatch.setattr("subprocess.run", mock_subprocess_run) - monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None) - monkeypatch.setattr("pathlib.Path.exists", lambda *a, **k: True) - - import utils.utils as utils_mod - - utils_mod.mibench(MockArgs(), SimpleNamespace(rocm_version="0.x.x")) - - assert len(subprocess_calls) == 1 - expected_args = [ # noqa: F841 - str(binary_path), - "-o", - str(tmp_path) + "/roofline.csv", - "-d", - "2", - ] - - -def test_mibench_no_binary_found_error(tmp_path, monkeypatch): - """ - Test mibench when no binary paths exist, should call console_error. - - Args: - tmp_path (Path): Temporary directory for test files. - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching. - - Returns: - None: Asserts that console_error is called when no binaries are found. - """ - - class MockArgs: - path = str(tmp_path) - device = 0 - quiet = False - - class MockMspec: - pass - - rocprof_home = tmp_path / "rocprof_home" - install_root = tmp_path / "install_root" - rocprof_home.mkdir(parents=True) - install_root.mkdir(parents=True) - - class MockConfig: - def __init__(self): - self.rocprof_compute_home = self.MockPath(rocprof_home, install_root) - - class MockPath: - def __init__(self, home_path, install_path): - self._home_path = home_path - self._install_path = install_path - self.parent = self.MockParent(install_path) - - def __str__(self): - return str(self._home_path) - - def __truediv__(self, other): - return self._home_path / other - - class MockParent: - def __init__(self, install_path): - self.parent = install_path - - def __truediv__(self, other): - return self.parent / other - - mock_config = MockConfig() - - def mock_detect_roofline(mspec): - return {"distro": "15.6", "rocm_ver": "0.x.x"} - - console_error_calls = [] - - def mock_console_error(category, msg): - console_error_calls.append((category, msg)) - raise RuntimeError("console_error called") - - monkeypatch.setattr("utils.utils.detect_roofline", mock_detect_roofline) - monkeypatch.setattr("utils.utils.config", mock_config) - monkeypatch.setattr("utils.utils.console_error", mock_console_error) - monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None) - - import utils.utils as utils_mod - - with pytest.raises(RuntimeError, match="console_error called"): - utils_mod.mibench(MockArgs(), SimpleNamespace(rocm_version="0.x.x")) - - assert len(console_error_calls) == 1 - assert console_error_calls[0][0] == "roofline" - assert "Unable to locate expected binary" in console_error_calls[0][1] - - -def test_mibench_quiet_flag_handling_bug(tmp_path, monkeypatch): - """ - Test mibench quiet flag handling demonstrates the bug where += splits the string. - - Args: - tmp_path (Path): Temporary directory for test files. - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching. - - Returns: - None: Asserts that the bug exists and characters are split. - """ - rocprof_home = tmp_path / "rocprof_home" - install_root = tmp_path / "install_root" - rocprof_home.mkdir(parents=True) - install_root.mkdir(parents=True) - - first_path = rocprof_home / "utils" / "rooflines" - first_path.mkdir(parents=True) - binary_path = first_path / "roofline-ubuntu22_04" - binary_path.write_text("#!/bin/bash\necho 'roofline executed'") - binary_path.chmod(0o755) - - class MockConfig: - def __init__(self): - self.rocprof_compute_home = self.MockPath(rocprof_home, install_root) - - class MockPath: - def __init__(self, home_path, install_path): - self._home_path = home_path - self._install_path = install_path - self.parent = self.MockParent(install_path) - - def __str__(self): - return str(self._home_path) - - def __truediv__(self, other): - return self._home_path / other - - class MockParent: - def __init__(self, install_path): - self.parent = install_path - - def __truediv__(self, other): - return self.parent / other - - mock_config = MockConfig() - - def mock_detect_roofline(mspec): - return {"distro": "22.04", "rocm_ver": "0.x.x"} - - subprocess_calls = [] - - def mock_subprocess_run(args, check=True): - subprocess_calls.append((args, check)) - - monkeypatch.setattr("utils.utils.detect_roofline", mock_detect_roofline) - monkeypatch.setattr("utils.utils.config", mock_config) - monkeypatch.setattr("subprocess.run", mock_subprocess_run) - monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None) - monkeypatch.setattr("pathlib.Path.exists", lambda *a, **k: True) - - import utils.utils as utils_mod - - class MockArgsQuiet: - path = str(tmp_path) - device = 0 - quiet = True - - class MockMspecQuiet: - pass - - utils_mod.mibench(MockArgsQuiet(), SimpleNamespace(rocm_version="0.x.x")) - - expected_base_args = [ - str(binary_path), - "-o", - str(tmp_path) + "/roofline.csv", - "-d", - "0", - ] - expected_full_args = expected_base_args + [ # noqa: F841 - "-", - "-", - "q", - "u", - "i", - "e", - "t", - ] - - subprocess_calls.clear() - - class MockArgsNotQuiet: - path = str(tmp_path) - device = 0 - quiet = False - - class MockMspecNotQuiet: - pass - - utils_mod.mibench(MockArgsQuiet(), SimpleNamespace(rocm_version="0.x.x")) - - expected_args = [ # noqa: F841 - str(binary_path), - "-o", - str(tmp_path) + "/roofline.csv", - "-d", - "0", - ] - - -def test_mibench_sles_distro_mapping(tmp_path, monkeypatch): - """ - Test mibench with SLES distro mapping. - - Args: - tmp_path (Path): Temporary directory for test files. - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching. - - Returns: - None: Asserts that SLES distro is correctly mapped. - """ - - class MockArgs: - path = str(tmp_path) - device = 3 - quiet = False - - class MockMspec: - pass - - rocprof_home = tmp_path / "rocprof_home" - install_root = tmp_path / "install_root" - rocprof_home.mkdir(parents=True) - install_root.mkdir(parents=True) - - first_path = rocprof_home / "utils" / "rooflines" - first_path.mkdir(parents=True) - binary_path = first_path / "roofline-sles15sp6" - binary_path.write_text("#!/bin/bash\necho 'roofline executed'") - binary_path.chmod(0o755) - - class MockConfig: - def __init__(self): - self.rocprof_compute_home = self.MockPath(rocprof_home, install_root) - - class MockPath: - def __init__(self, home_path, install_path): - self._home_path = home_path - self._install_path = install_path - self.parent = self.MockParent(install_path) - - def __str__(self): - return str(self._home_path) - - def __truediv__(self, other): - return self._home_path / other - - class MockParent: - def __init__(self, install_path): - self.parent = install_path - - def __truediv__(self, other): - return self.parent / other - - mock_config = MockConfig() - - def mock_detect_roofline(mspec): - return {"distro": "15.6", "rocm_ver": "0.x.x"} - - subprocess_calls = [] - - def mock_subprocess_run(args, check=True): - subprocess_calls.append((args, check)) - - monkeypatch.setattr("utils.utils.detect_roofline", mock_detect_roofline) - monkeypatch.setattr("utils.utils.config", mock_config) - monkeypatch.setattr("subprocess.run", mock_subprocess_run) - monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None) - monkeypatch.setattr("pathlib.Path.exists", lambda *a, **k: True) - - import utils.utils as utils_mod - - utils_mod.mibench(MockArgs(), SimpleNamespace(rocm_version="0.x.x")) - - assert len(subprocess_calls) == 1 - - -def test_mibench_subprocess_run_failure(tmp_path, monkeypatch): - """ - Test mibench when subprocess.run raises an exception. - - Args: - tmp_path (Path): Temporary directory for test files. - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching. - - Returns: - None: Asserts that subprocess exceptions are properly propagated. - """ - - class MockArgs: - path = str(tmp_path) - device = 0 - quiet = False - - class MockMspec: - pass - - override_binary_path = tmp_path / "failing_roofline" - override_binary_path.write_text("#!/bin/bash\nexit 1") - override_binary_path.chmod(0o755) - - def mock_detect_roofline(mspec): - return { - "distro": "override", - "path": str(override_binary_path), - "rocm_ver": "0.x.x", - } - - def mock_subprocess_run(args, check=True): - raise subprocess.CalledProcessError(1, args) - - monkeypatch.setattr("utils.utils.detect_roofline", mock_detect_roofline) - monkeypatch.setattr("subprocess.run", mock_subprocess_run) - monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None) - - import utils.utils as utils_mod - - with pytest.raises(subprocess.CalledProcessError): - utils_mod.mibench(MockArgs(), SimpleNamespace(rocm_version="0.x.x")) - - -def test_mibench_device_string_conversion(tmp_path, monkeypatch): - """ - Test mibench correctly converts device ID to string. - - Args: - tmp_path (Path): Temporary directory for test files. - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching. - - Returns: - None: Asserts that device ID is converted to string in subprocess args. - """ - - class MockArgs: - path = str(tmp_path) - device = 42 - quiet = False - - class MockMspec: - pass - - override_binary_path = tmp_path / "test_roofline" - override_binary_path.write_text("#!/bin/bash\necho 'success'") - override_binary_path.chmod(0o755) - - def mock_detect_roofline(mspec): - return { - "distro": "override", - "path": str(override_binary_path), - "rocm_ver": "0.x.x", - } - - subprocess_calls = [] - - def mock_subprocess_run(args, check=True): - subprocess_calls.append(args) - - monkeypatch.setattr("utils.utils.detect_roofline", mock_detect_roofline) - monkeypatch.setattr("subprocess.run", mock_subprocess_run) - monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None) - - import utils.utils as utils_mod - - utils_mod.mibench(MockArgs(), SimpleNamespace(rocm_version="0.x.x")) - - assert len(subprocess_calls) == 1 - device_arg_index = subprocess_calls[0].index("-d") + 1 - assert subprocess_calls[0][device_arg_index] == "42" - assert isinstance(subprocess_calls[0][device_arg_index], str) - - -def test_mibench_unknown_distro_mapping(tmp_path, monkeypatch): - """ - Test mibench behavior with unknown distro (should cause KeyError). - - Args: - tmp_path (Path): Temporary directory for test files. - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching. - - Returns: - None: Asserts that KeyError is raised for unknown distro. - """ - - class MockArgs: - path = str(tmp_path) - device = 0 - quiet = False - - class MockMspec: - pass - - rocprof_home = tmp_path / "rocprof_home" - install_root = tmp_path / "install_root" - rocprof_home.mkdir(parents=True) - install_root.mkdir(parents=True) - - class MockConfig: - def __init__(self): - self.rocprof_compute_home = self.MockPath(rocprof_home, install_root) - - class MockPath: - def __init__(self, home_path, install_path): - self._home_path = home_path - self._install_path = install_path - self.parent = self.MockParent(install_path) - - def __str__(self): - return str(self._home_path) - - def __truediv__(self, other): - return self._home_path / other - - class MockParent: - def __init__(self, install_path): - self.parent = install_path - - def __truediv__(self, other): - return self.parent / other - - mock_config = MockConfig() - - def mock_detect_roofline(mspec): - return {"distro": "unknown_distro", "rocm_ver": "0.x.x"} # Not in distro_map - - monkeypatch.setattr("utils.utils.detect_roofline", mock_detect_roofline) - monkeypatch.setattr("utils.utils.config", mock_config) - monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None) - - import utils.utils as utils_mod - - with pytest.raises(KeyError): - utils_mod.mibench(MockArgs(), SimpleNamespace(rocm_version="0.x.x")) - - -def test_mibench_console_log_called(tmp_path, monkeypatch): - """ - Test mibench calls console_log with correct message. - - Args: - tmp_path (Path): Temporary directory for test files. - monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching. - - Returns: - None: Asserts that console_log is called with expected message. - """ - - class MockArgs: - path = str(tmp_path) - device = 0 - quiet = False - - class MockMspec: - pass - - override_binary_path = tmp_path / "test_roofline" - override_binary_path.write_text("#!/bin/bash\necho 'success'") - override_binary_path.chmod(0o755) - - def mock_detect_roofline(mspec): - return { - "distro": "override", - "path": str(override_binary_path), - "rocm_ver": "0.x.x", - } - - console_log_calls = [] - - def mock_console_log(category, message): - console_log_calls.append((category, message)) - - def mock_subprocess_run(args, check=True): - pass - - monkeypatch.setattr("utils.utils.detect_roofline", mock_detect_roofline) - monkeypatch.setattr("subprocess.run", mock_subprocess_run) - monkeypatch.setattr("utils.utils.console_log", mock_console_log) - - import utils.utils as utils_mod - - utils_mod.mibench(MockArgs(), SimpleNamespace(rocm_version="0.x.x")) - - assert len(console_log_calls) == 1 - assert console_log_calls[0][0] == "roofline" - assert console_log_calls[0][1] == "No roofline data found. Generating..." - - """ Normal Functionality: