[rocprofiler-compute] Add exception handling for native tool path search (#2159)
* Add exception handling for native tool path search * Fix formatting in roofline benchmark code * Fix detection of .so files * include hip code and native tool code in standalone binary * add fallback path for ROCM_PATH
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
4ff89b6fd1
Коммит
d8a8a3ef30
@@ -657,6 +657,8 @@ add_custom_target(
|
||||
--onefile-tempdir-spec=/{PRODUCT}/rocprof_compute_standalonebinary_{PID}
|
||||
--include-data-files=${PROJECT_SOURCE_DIR}/VERSION*=./ --enable-plugin=no-qt
|
||||
--include-data-files=src/lib/rocprofiler_compute_tool.cpp=lib/rocprofiler_compute_tool.cpp
|
||||
--include-data-files=src/lib/helper.cpp=lib/helper.cpp
|
||||
--include-data-files=src/lib/helper.hpp=lib/helper.hpp
|
||||
--include-package=dash_svg --include-package-data=dash_svg
|
||||
--include-package=dash_bootstrap_components
|
||||
--include-package-data=dash_bootstrap_components --include-package=plotly
|
||||
@@ -667,7 +669,8 @@ add_custom_target(
|
||||
--include-package-data=rocprof_compute_profile
|
||||
--include-package=rocprof_compute_tui --include-package-data=rocprof_compute_tui
|
||||
--include-package=rocprof_compute_soc --include-package-data=rocprof_compute_soc
|
||||
--include-package=utils --include-package-data=utils src/rocprof-compute
|
||||
--include-package=utils --include-package-data=utils
|
||||
--include-package=hip --include-package-data=hip src/rocprof-compute
|
||||
# Remove library rpath from executable
|
||||
COMMAND patchelf --remove-rpath rocprof-compute.bin
|
||||
# Move to build directory
|
||||
|
||||
@@ -26,6 +26,8 @@ CMD ["/bin/bash", "-c", "\
|
||||
--enable-plugin=no-qt \
|
||||
--include-data-files=VERSION*=./ \
|
||||
--include-data-files=src/lib/rocprofiler_compute_tool.cpp=lib/rocprofiler_compute_tool.cpp \
|
||||
--include-data-files=src/lib/helper.cpp=lib/helper.cpp \
|
||||
--include-data-files=src/lib/helper.hpp=lib/helper.hpp \
|
||||
--include-package=dash_svg --include-package-data=dash_svg \
|
||||
--include-package=dash_bootstrap_components \
|
||||
--include-package-data=dash_bootstrap_components \
|
||||
@@ -38,6 +40,7 @@ CMD ["/bin/bash", "-c", "\
|
||||
--include-package=rocprof_compute_tui --include-package-data=rocprof_compute_tui \
|
||||
--include-package=rocprof_compute_soc --include-package-data=rocprof_compute_soc \
|
||||
--include-package=utils --include-package-data=utils \
|
||||
--include-package=hip --include-package-data=hip \
|
||||
src/rocprof-compute \
|
||||
&& patchelf --remove-rpath rocprof-compute.bin \
|
||||
"]
|
||||
|
||||
@@ -37,8 +37,9 @@ from ctypes import (
|
||||
c_uint8,
|
||||
c_void_p,
|
||||
)
|
||||
import os
|
||||
|
||||
_lib = ctypes.CDLL("libamdhip64.so")
|
||||
_lib = ctypes.CDLL(f"{os.getenv('ROCM_PATH', '/opt/rocm')}/lib/libamdhip64.so")
|
||||
|
||||
|
||||
# Mirrors struct hipUUID_t
|
||||
@@ -299,7 +300,6 @@ class HIPModule:
|
||||
|
||||
|
||||
def hipGetDeviceCount() -> int:
|
||||
|
||||
device_count = c_int()
|
||||
status = _lib.hipGetDeviceCount(byref(device_count))
|
||||
|
||||
@@ -310,7 +310,6 @@ def hipGetDeviceCount() -> int:
|
||||
|
||||
|
||||
def hipGetDeviceProperties(device_id: int) -> HIPDeviceProperties:
|
||||
|
||||
props = HIPDeviceProperties()
|
||||
res = _lib.hipGetDevicePropertiesR0600(byref(props), device_id)
|
||||
|
||||
@@ -321,7 +320,6 @@ def hipGetDeviceProperties(device_id: int) -> HIPDeviceProperties:
|
||||
|
||||
|
||||
def hipMalloc(size: int) -> HIPDeviceMemory:
|
||||
|
||||
buf_size = c_size_t(size)
|
||||
ptr = c_void_p()
|
||||
|
||||
@@ -334,7 +332,6 @@ def hipMalloc(size: int) -> HIPDeviceMemory:
|
||||
|
||||
|
||||
def hipMemcpyHtoD(dst: HIPDeviceMemory, src: POINTER, size: int) -> None:
|
||||
|
||||
res = _lib.hipMemcpyHtoD(dst.ptr, src, size)
|
||||
|
||||
if res != 0:
|
||||
@@ -342,7 +339,6 @@ def hipMemcpyHtoD(dst: HIPDeviceMemory, src: POINTER, size: int) -> None:
|
||||
|
||||
|
||||
def hipMemcpyDtoH(dst: POINTER, src: HIPDeviceMemory, size: int) -> None:
|
||||
|
||||
res = _lib.hipMemcpyDtoH(dst, src.ptr, size)
|
||||
|
||||
if res != 0:
|
||||
@@ -350,7 +346,6 @@ def hipMemcpyDtoH(dst: POINTER, src: HIPDeviceMemory, size: int) -> None:
|
||||
|
||||
|
||||
def hipSetDevice(id: int) -> None:
|
||||
|
||||
status = _lib.hipSetDevice(id)
|
||||
|
||||
if status != 0:
|
||||
@@ -358,7 +353,6 @@ def hipSetDevice(id: int) -> None:
|
||||
|
||||
|
||||
def hipDeviceSynchronize() -> None:
|
||||
|
||||
res = _lib.hipDeviceSynchronize()
|
||||
|
||||
if res != 0:
|
||||
@@ -366,7 +360,6 @@ def hipDeviceSynchronize() -> None:
|
||||
|
||||
|
||||
def hipModuleLoadData(code: POINTER) -> HIPModule:
|
||||
|
||||
module = c_void_p()
|
||||
res = _lib.hipModuleLoadData(byref(module), code)
|
||||
|
||||
@@ -377,7 +370,6 @@ def hipModuleLoadData(code: POINTER) -> HIPModule:
|
||||
|
||||
|
||||
def hipModuleGetFunction(module: POINTER, name: str) -> POINTER:
|
||||
|
||||
name_bytes = name.encode("utf-8")
|
||||
func = c_void_p()
|
||||
|
||||
@@ -402,7 +394,6 @@ def hipModuleLaunchKernel(
|
||||
kernel_params: POINTER,
|
||||
extra: POINTER = None,
|
||||
) -> None:
|
||||
|
||||
res = _lib.hipModuleLaunchKernel(
|
||||
func,
|
||||
grid_dim_x,
|
||||
@@ -422,7 +413,6 @@ def hipModuleLaunchKernel(
|
||||
|
||||
|
||||
def hipEventCreate() -> HIPEvent:
|
||||
|
||||
handle = c_void_p()
|
||||
|
||||
res = _lib.hipEventCreate(byref(handle))
|
||||
@@ -434,7 +424,6 @@ def hipEventCreate() -> HIPEvent:
|
||||
|
||||
|
||||
def hipEventRecord(event: HIPEvent, stream: POINTER = None) -> None:
|
||||
|
||||
res = _lib.hipEventRecord(event.handle, stream)
|
||||
|
||||
if res != 0:
|
||||
@@ -442,7 +431,6 @@ def hipEventRecord(event: HIPEvent, stream: POINTER = None) -> None:
|
||||
|
||||
|
||||
def hipEventElapsedTime(start: HIPEvent, stop: HIPEvent) -> float:
|
||||
|
||||
ms = c_float()
|
||||
|
||||
res = _lib.hipEventElapsedTime(byref(ms), start.handle, stop.handle)
|
||||
|
||||
@@ -33,8 +33,9 @@ from ctypes import (
|
||||
c_size_t,
|
||||
c_void_p,
|
||||
)
|
||||
import os
|
||||
|
||||
_lib = ctypes.CDLL("libhiprtc.so")
|
||||
_lib = ctypes.CDLL(f"{os.getenv('ROCM_PATH', '/opt/rocm')}/lib/libhiprtc.so")
|
||||
|
||||
|
||||
_lib.hiprtcCreateProgram.restype = c_int
|
||||
@@ -112,7 +113,6 @@ class HIPRTCProgram:
|
||||
|
||||
# TODO: Handle headers
|
||||
def hiprtcCreateProgram(src: str, name: str) -> HIPRTCProgram:
|
||||
|
||||
src_bytes = src.encode("utf-8")
|
||||
name_bytes = name.encode("utf-8")
|
||||
|
||||
@@ -128,7 +128,6 @@ def hiprtcCreateProgram(src: str, name: str) -> HIPRTCProgram:
|
||||
|
||||
# TODO: Handle compile options
|
||||
def hiprtcCompileProgram(prog: HIPRTCProgram) -> None:
|
||||
|
||||
res = _lib.hiprtcCompileProgram(prog.handle, 0, None)
|
||||
|
||||
if res != 0:
|
||||
@@ -136,7 +135,6 @@ def hiprtcCompileProgram(prog: HIPRTCProgram) -> None:
|
||||
|
||||
|
||||
def hiprtcGetProgramLogSize(prog: HIPRTCProgram) -> int:
|
||||
|
||||
size = c_size_t(0)
|
||||
|
||||
res = _lib.hiprtcGetProgramLogSize(prog.handle, byref(size))
|
||||
@@ -148,7 +146,6 @@ def hiprtcGetProgramLogSize(prog: HIPRTCProgram) -> int:
|
||||
|
||||
|
||||
def hiprtcGetProgramLog(prog: HIPRTCProgram) -> str:
|
||||
|
||||
size = hiprtcGetProgramLogSize(prog)
|
||||
buf = (ctypes.c_char * size)()
|
||||
|
||||
@@ -171,7 +168,6 @@ def hiprtcGetCodeSize(prog: HIPRTCProgram) -> int:
|
||||
|
||||
|
||||
def hiprtcGetCode(prog: HIPRTCProgram) -> POINTER:
|
||||
|
||||
size = hiprtcGetCodeSize(prog)
|
||||
buf = (c_char * size)()
|
||||
res = _lib.hiprtcGetCode(prog.handle, buf)
|
||||
@@ -183,7 +179,6 @@ def hiprtcGetCode(prog: HIPRTCProgram) -> POINTER:
|
||||
|
||||
|
||||
def hiprtcGetLoweredName(prog: HIPRTCProgram, name_expression: str) -> str:
|
||||
|
||||
expr_bytes = name_expression.encode("utf-8")
|
||||
name_bytes = c_char_p()
|
||||
|
||||
@@ -196,7 +191,6 @@ def hiprtcGetLoweredName(prog: HIPRTCProgram, name_expression: str) -> str:
|
||||
|
||||
|
||||
def hiprtcAddNameExpression(prog: HIPRTCProgram, name_expression: str) -> None:
|
||||
|
||||
expr_bytes = name_expression.encode("utf-8")
|
||||
|
||||
res = _lib.hiprtcAddNameExpression(prog.handle, expr_bytes)
|
||||
|
||||
@@ -429,13 +429,20 @@ class RocProfCompute_Base:
|
||||
and not args.attach_pid
|
||||
):
|
||||
# Use native counter collection tool
|
||||
native_tool_path = str(
|
||||
Path(sys.argv[0]).resolve().parents[2]
|
||||
/ "lib"
|
||||
/ "rocprofiler-compute"
|
||||
/ "librocprofiler-compute-tool.so"
|
||||
)
|
||||
if not Path(native_tool_path).is_file():
|
||||
try:
|
||||
native_tool_path = str(
|
||||
Path(sys.argv[0]).resolve().parents[2]
|
||||
/ "lib"
|
||||
/ "rocprofiler-compute"
|
||||
/ "librocprofiler-compute-tool.so"
|
||||
)
|
||||
except Exception as e:
|
||||
console_debug(
|
||||
f"Could not find pre-built native tool: {e}. "
|
||||
"Building native tool now."
|
||||
)
|
||||
native_tool_path = None
|
||||
if not (native_tool_path and Path(native_tool_path).is_file()):
|
||||
# Build native counter collection tool if not exists
|
||||
native_tool_path = str(
|
||||
Path(
|
||||
|
||||
@@ -170,7 +170,6 @@ DEFAULT_DATASET_SIZE = 512 * 1024 * 1024
|
||||
|
||||
|
||||
def show_progress(pct: float) -> None:
|
||||
|
||||
bar_char = "|"
|
||||
bar_size = 60
|
||||
|
||||
@@ -182,7 +181,6 @@ def show_progress(pct: float) -> None:
|
||||
|
||||
# Returns a named tuple with the mean, std deviation and confidence
|
||||
def calc_stats(samples: list) -> Stats:
|
||||
|
||||
mean = sum(samples) / len(samples)
|
||||
|
||||
stdev = 0.0
|
||||
@@ -213,7 +211,6 @@ class Program:
|
||||
self.module = hip.hipModuleLoadData(self.code)
|
||||
|
||||
def get_kernel(self, kernel_name: str) -> POINTER:
|
||||
|
||||
# TODO: Why doesn't hiprtcGetLoweredName work with non-template functions?
|
||||
if "<" in kernel_name:
|
||||
kernel_name = hiprtc.hiprtcGetLoweredName(self.prog, kernel_name)
|
||||
@@ -230,7 +227,6 @@ def launch_kernel(
|
||||
stream: POINTER,
|
||||
args: list[Any] = [],
|
||||
) -> None:
|
||||
|
||||
# Convert to native types
|
||||
args_converted = []
|
||||
for arg in args:
|
||||
@@ -262,7 +258,6 @@ def launch_kernel(
|
||||
|
||||
# Retrieve the gfx architecture
|
||||
def get_gfx_arch(device: int) -> str:
|
||||
|
||||
arch_str = hip.hipGetDeviceProperties(device).gcnArchName
|
||||
|
||||
# Parse out only gfx
|
||||
@@ -280,7 +275,6 @@ def run_get_samples(
|
||||
stream: POINTER,
|
||||
args: list[Any] = [],
|
||||
) -> list[float]:
|
||||
|
||||
event_start = hip.hipEventCreate()
|
||||
event_stop = hip.hipEventCreate()
|
||||
|
||||
@@ -405,8 +399,12 @@ def hbm_bw_benchmark(device: int) -> PerfMetrics:
|
||||
perf_metrics = PerfMetrics(mean, mean - stats.confidence, mean + stats.confidence)
|
||||
|
||||
event_ms = total_bytes / mean / 1e6
|
||||
|
||||
print(
|
||||
f"HBM BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:.1f} GB/sec"
|
||||
f"HBM BW, GPU ID: {device}, workgroupSize:{workgroup_size}, "
|
||||
f"workgroups:{workgroups}, experiments:{num_experiments}, "
|
||||
f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, "
|
||||
f"mean:{mean:.1f} GB/sec, stdev:{stdev:.1f} GB/sec"
|
||||
)
|
||||
|
||||
return perf_metrics
|
||||
@@ -463,7 +461,10 @@ def cache_bw_bench(device: int, type: str, iters: int) -> PerfMetrics:
|
||||
event_ms = total_bytes / mean / 1e6
|
||||
|
||||
print(
|
||||
f"{type} BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:1f} GB/sec"
|
||||
f"{type} BW, GPU ID: {device}, workgroupSize:{workgroup_size}, "
|
||||
f"workgroups:{workgroups}, experiments:{num_experiments}, "
|
||||
f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, "
|
||||
f"mean:{mean:.1f} GB/sec, stdev:{stdev:1f} GB/sec"
|
||||
)
|
||||
|
||||
return perf_metrics
|
||||
@@ -511,7 +512,6 @@ extern "C" __global__ void LDS_bw(int numIter, float *dummy)
|
||||
|
||||
|
||||
def lds_bw_benchmark(device: int) -> PerfMetrics:
|
||||
|
||||
num_experiments = DEFAULT_NUM_EXPERIMENTS
|
||||
workgroup_size = DEFAULT_WORKGROUP_SIZE
|
||||
|
||||
@@ -553,7 +553,10 @@ def lds_bw_benchmark(device: int) -> PerfMetrics:
|
||||
event_ms = total_bytes / mean / 1e6
|
||||
|
||||
print(
|
||||
f"LDS BW, GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, mean:{mean:.1f} GB/sec, stdev={stdev:1f} GB/sec"
|
||||
f"LDS BW, GPU ID: {device}, workgroupSize:{workgroup_size}, "
|
||||
f"workgroups:{workgroups}, experiments:{num_experiments}, "
|
||||
f"traffic:{total_bytes} bytes, duration:{event_ms:.1f} ms, "
|
||||
f"mean:{mean:.1f} GB/sec, stdev:{stdev:1f} GB/sec"
|
||||
)
|
||||
|
||||
return perf_metrics
|
||||
@@ -635,7 +638,9 @@ def flops_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics:
|
||||
event_ms = total_flops / mean / 1e6
|
||||
|
||||
print(
|
||||
f"Peak {unit}s ({type}), GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{num_experiments}, {unit}:{total_flops}, duration:{event_ms:.1f} ms, mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS"
|
||||
f"workgroups:{workgroups}, experiments:{num_experiments}, "
|
||||
f"{unit}:{total_flops}, duration:{event_ms:.1f} ms, "
|
||||
f"mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS"
|
||||
)
|
||||
|
||||
return perf_metrics
|
||||
@@ -862,30 +867,80 @@ template<int datatype> __global__ void mfma_f8f6f4(int iter, float *dummy)
|
||||
case FP8_E4M3: // fp8 x fp8
|
||||
for(int i = 0; i < iter; ++i)
|
||||
{
|
||||
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 0, 0, 0, 0, 0, 0);
|
||||
}
|
||||
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
|
||||
a,
|
||||
a,
|
||||
result,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0
|
||||
);
|
||||
}
|
||||
case BF8_E5M2: // bf8 x bf8
|
||||
for(int i = 0; i < iter; ++i)
|
||||
{
|
||||
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 1, 1, 0, 0, 0, 0);
|
||||
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
|
||||
a,
|
||||
a,
|
||||
result,
|
||||
1,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0
|
||||
);
|
||||
}
|
||||
break;
|
||||
case FP6_E2M3: // fp6 x fp6
|
||||
for(int i = 0; i < iter; ++i)
|
||||
{
|
||||
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 2, 2, 0, 0, 0, 0);
|
||||
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
|
||||
a,
|
||||
a,
|
||||
result,
|
||||
2,
|
||||
2,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0
|
||||
);
|
||||
}
|
||||
break;
|
||||
case BF6_E3M2: // bf6 x bf6
|
||||
for(int i = 0; i < iter; ++i)
|
||||
{
|
||||
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 3, 3, 0, 0, 0, 0);
|
||||
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
|
||||
a,
|
||||
a,
|
||||
result,
|
||||
3,
|
||||
3,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0
|
||||
);
|
||||
}
|
||||
break;
|
||||
case FP4_E2M1: // fp4 x fp4
|
||||
for(int i = 0; i < iter; ++i)
|
||||
{
|
||||
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a, a, result, 4, 4, 0, 0, 0, 0);
|
||||
result = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(
|
||||
a,
|
||||
a,
|
||||
result,
|
||||
4,
|
||||
4,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
0
|
||||
);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -954,79 +1009,69 @@ def mfma_bench(device: int, type: str, unit: str, rate: int) -> PerfMetrics:
|
||||
event_ms = total_flops / mean / 1e6
|
||||
|
||||
print(
|
||||
f"Peak MFMA {unit}s ({type}), GPU ID: {device}, workgroupSize:{workgroup_size}, workgroups:{workgroups}, experiments:{experiments}, {unit}:{total_flops}, duration:{event_ms:.2f} ms, mean:{mean:.1f} {rate}, stdev={stdev:.1f} GFLOPS"
|
||||
f"Peak MFMA {unit}s ({type}), GPU ID: {device}, "
|
||||
f"workgroupSize:{workgroup_size}, workgroups:{workgroups}, "
|
||||
f"experiments:{experiments}, {unit}:{total_flops}, "
|
||||
f"duration:{event_ms:.2f} ms, mean:{mean:.1f} {rate}, "
|
||||
f"stdev:{stdev:.1f} GFLOPS"
|
||||
)
|
||||
|
||||
return perf_metrics
|
||||
|
||||
|
||||
def mfma_f32_bench(device: int) -> PerfMetrics:
|
||||
|
||||
return mfma_bench(device, "F32", "FLOP", "GFLOPS")
|
||||
|
||||
|
||||
def mfma_f16_bench(device: int) -> PerfMetrics:
|
||||
|
||||
return mfma_bench(device, "F16", "FLOP", "GFLOPS")
|
||||
|
||||
|
||||
def mfma_bf16_bench(device: int) -> PerfMetrics:
|
||||
|
||||
return mfma_bench(device, "BF16", "FLOP", "GFLOPS")
|
||||
|
||||
|
||||
def mfma_f64_bench(device: int) -> PerfMetrics:
|
||||
|
||||
return mfma_bench(device, "F64", "FLOP", "GFLOPS")
|
||||
|
||||
|
||||
def mfma_f8_bench(device: int) -> PerfMetrics:
|
||||
|
||||
return mfma_bench(device, "F8", "FLOP", "GFLOPS")
|
||||
|
||||
|
||||
def mfma_i8_bench(device: int) -> PerfMetrics:
|
||||
|
||||
return mfma_bench(device, "I8", "IOP", "GOPS")
|
||||
|
||||
|
||||
def mfma_f4_bench(device: int) -> PerfMetrics:
|
||||
|
||||
return mfma_bench(device, "F4", "FLOP", "GFLOPS")
|
||||
|
||||
|
||||
def mfma_f6_bench(device: int) -> PerfMetrics:
|
||||
|
||||
return mfma_bench(device, "F6", "FLOP", "GFLOPS")
|
||||
|
||||
|
||||
def fp16_benchmark(device: int) -> PerfMetrics:
|
||||
|
||||
return flops_bench(device, "FP16", "FLOP", "GFLOPS")
|
||||
|
||||
|
||||
def fp32_benchmark(device: int) -> PerfMetrics:
|
||||
|
||||
return flops_bench(device, "FP32", "FLOP", "GFLOPS")
|
||||
|
||||
|
||||
def fp64_benchmark(device: int) -> PerfMetrics:
|
||||
|
||||
return flops_bench(device, "FP64", "FLOP", "GFLOPS")
|
||||
|
||||
|
||||
def int8_benchmark(device: int) -> PerfMetrics:
|
||||
|
||||
return flops_bench(device, "INT8", "IOP", "GOPS")
|
||||
|
||||
|
||||
def int32_benchmark(device: int) -> PerfMetrics:
|
||||
|
||||
return flops_bench(device, "INT32", "IOP", "GOPS")
|
||||
|
||||
|
||||
def int64_benchmark(device: int) -> PerfMetrics:
|
||||
|
||||
return flops_bench(device, "INT64", "IOP", "GOPS")
|
||||
|
||||
|
||||
@@ -1055,7 +1100,6 @@ tests = {
|
||||
|
||||
# Run the roofine tests on the specified device
|
||||
def run_benchmark(device: int) -> dict[PerfMetrics]:
|
||||
|
||||
metrics_dict = {}
|
||||
|
||||
arch = get_gfx_arch(device)
|
||||
@@ -1079,7 +1123,6 @@ def run_benchmark(device: int) -> dict[PerfMetrics]:
|
||||
# Returns a dictionary mapping device ID to dictionary of
|
||||
# metrics
|
||||
def run_on_devices(devices: list[int]) -> dict[dict[PerfMetrics]]:
|
||||
|
||||
metrics = {}
|
||||
for d in devices:
|
||||
metrics[d] = run_benchmark(d)
|
||||
|
||||
Ссылка в новой задаче
Block a user