Fix hbmBW calculation (#243)

* Fix hbmBW calculation: we were reading the wrong mclk value out of rocm-smi

Signed-off-by: Nicholas Curtis <nicurtis@amd.com>

* Fix for earlier smi

Signed-off-by: Nicholas Curtis <nicurtis@amd.com>

---------

Signed-off-by: Nicholas Curtis <nicurtis@amd.com>
Co-authored-by: Nicholas Curtis <nicurtis@amd.com>

[ROCm/rocprofiler-compute commit: c60f031558]
Dieser Commit ist enthalten in:
Nick Curtis
2024-02-13 21:19:56 -05:00
committet von GitHub
Ursprung a5b1082836
Commit 0dcd6e35dd
3 geänderte Dateien mit 47 neuen und 24 gelöschten Zeilen
@@ -30,7 +30,7 @@ import os
import pandas as pd
import numpy as np
from utils import schema
from utils.utils import error, get_hbm_stack_num
from utils.utils import error
from pathlib import Path
import logging
@@ -409,24 +409,7 @@ def calc_builtin_var(var, sys_info):
if isinstance(var, int):
return var
elif isinstance(var, str) and var.startswith("$totalL2Banks"):
# Fixme: support all supported partitioning mode
# Fixme: "name" is a bad name!
totalL2Banks = sys_info.L2Banks
if (
sys_info["name"].lower() == "mi300a_a0"
or sys_info["name"].lower() == "mi300a_a1"
):
totalL2Banks = sys_info.L2Banks * get_hbm_stack_num(
sys_info["name"], sys_info["memory_partition"]
)
elif (
sys_info["name"].lower() == "mi300x_a0"
or sys_info["name"].lower() == "mi300x_a1"
):
totalL2Banks = sys_info.L2Banks * get_hbm_stack_num(
sys_info["name"], sys_info["memory_partition"]
)
return totalL2Banks
return sys_info.totalL2Banks
else:
print("Don't support", var)
sys.exit(1)
@@ -1014,6 +997,7 @@ def correct_sys_info(df, specs_correction):
"cur_sclk": "cur_sclk",
"cur_mclk": "cur_mclk",
"L2Banks": "L2Banks",
"totalL2Banks": "totalL2Banks",
"LDSBanks": "LDSBanks",
"numSQC": "numSQC",
"numPipes": "numPipes",
@@ -35,7 +35,7 @@ import logging
from dataclasses import dataclass
from pathlib import Path as path
from textwrap import dedent
from utils.utils import error
from utils.utils import error, get_hbm_stack_num
@dataclass
class MachineSpecs:
@@ -57,10 +57,12 @@ class MachineSpecs:
wave_size: str
workgroup_max_size: str
max_sclk: str
max_mclk: str
cur_sclk: str
cur_mclk: str
max_waves_per_cu: str
L2Banks: str
totalL2Banks: str
LDSBanks: str
numSQC: str
numPipes: str
@@ -86,6 +88,7 @@ class MachineSpecs:
L1: {self.L1} KB
L2: {self.L2} KB
max_sclk: {self.max_sclk} MHz
max_mclk: {self.max_mclk} MHz
cur_sclk: {self.cur_sclk} MHz
cur_mclk: {self.cur_mclk} MHz
CU: {self.CU}
@@ -95,6 +98,7 @@ class MachineSpecs:
workgroup_max_size: {self.workgroup_max_size}
max_waves_per_cu: {self.max_waves_per_cu}
L2Banks: {self.L2Banks}
totalL2Banks: {self.totalL2Banks}
LDSBanks: {self.LDSBanks}
numSQC: {self.numSQC}
numPipes: {self.numPipes}
@@ -114,6 +118,7 @@ def gpuinfo():
"L1": None,
"L2": None,
"max_sclk": None,
"max_mclk": None,
"num_CU": None,
"num_SIMD": None,
"numPipes": None,
@@ -128,6 +133,10 @@ def gpuinfo():
"memory_partition": None,
}
# we get the max mclk from rocm-smi --showmclkrange
rocm_smi_mclk = run(["rocm-smi", "--showmclkrange"], exit_on_error=True)
gpu_info['max_mclk'] = search(r'(\d+)Mhz\s*$', rocm_smi_mclk)
# Fixme: find better way to differentiate cards, GPU vs APU, etc.
rocminfo_full = run(["rocminfo"])
rocminfo = rocminfo_full.split("\n")
@@ -246,6 +255,23 @@ def search(pattern, string):
return m.group(1)
return None
def total_l2_banks(archname, L2Banks, memory_partition):
    """Return the total L2 bank count for an architecture.

    Args:
        archname: Architecture name; compared case-insensitively against
            the MI300 A0/A1 variants listed below.
        L2Banks: Base L2 bank count (the caller passes an int).
        memory_partition: Memory partition mode, forwarded unchanged to
            ``get_hbm_stack_num``.

    Returns:
        ``L2Banks * get_hbm_stack_num(archname, memory_partition)`` for the
        MI300A/MI300X A0/A1 names, otherwise ``L2Banks`` unchanged.
    """
    # Fixme: support all supported partitioning mode
    # MI300A and MI300X scale identically, so one membership test replaces
    # the two duplicated branches.
    scaled_archs = ("mi300a_a0", "mi300a_a1", "mi300x_a0", "mi300x_a1")
    if archname.lower() in scaled_archs:
        return L2Banks * get_hbm_stack_num(archname, memory_partition)
    return L2Banks
def get_machine_specs(devicenum):
cpuinfo = path("/proc/cpuinfo").read_text()
@@ -323,9 +349,6 @@ def get_machine_specs(devicenum):
# FIXME with device
vbios = search(r"VBIOS version: (.*?)$", run(["rocm-smi", "-v"], exit_on_error=True))
# FIXME with spec
hbmBW = str(int(cur_mclk) / 1000 * 4096 / 8 * 2)
compute_partition = search(
r"Compute Partition:\s*(\w+)", run(["rocm-smi", "--showcomputepartition"])
)
@@ -338,6 +361,18 @@ def get_machine_specs(devicenum):
if memory_partition == None:
memory_partition = "NA"
totalL2Banks = total_l2_banks(
gpu_info['gpu_name'], int(gpu_info['L2Banks']), memory_partition)
hbmchannels = totalL2Banks
if (
gpu_info['gpu_name'].lower() == "mi300a_a0"
or gpu_info['gpu_name'].lower() == "mi300a_a1"
) and memory_partition.lower() == "nps1":
# we have an extra 32 channels for the CCD
hbmchannels += 32
hbmBW = str(int(gpu_info['max_mclk']) / 1000 * 32 * hbmchannels)
totalL2Banks = str(totalL2Banks)
return MachineSpecs(
hostname,
CPU,
@@ -357,10 +392,12 @@ def get_machine_specs(devicenum):
gpu_info['wave_size'],
gpu_info['grp_size'],
gpu_info['max_sclk'],
gpu_info['max_mclk'],
cur_sclk,
cur_mclk,
gpu_info['max_waves_per_cu'],
gpu_info['L2Banks'],
totalL2Banks,
gpu_info['LDSBanks'],
gpu_info['numSQC'],
gpu_info['numPipes'],
@@ -299,7 +299,8 @@ def gen_sysinfo(workload_name, workload_dir, ip_blocks, app_cmd, skip_roof, roof
header += "command,"
header += "host_name,host_cpu,sbios,host_distro,host_kernel,host_rocmver,date,"
header += "gpu_soc,vbios,numSE,numCU,numSIMD,waveSize,maxWavesPerCU,maxWorkgroupSize,"
header += "L1,L2,sclk,mclk,cur_sclk,cur_mclk,L2Banks,LDSBanks,name,numSQC,numPipes,hbmBW,compute_partition,memory_partition,"
header += "L1,L2,sclk,mclk,cur_sclk,cur_mclk,L2Banks,totalL2Banks,LDSBanks,name,numSQC,numPipes,"
header += "hbmBW,compute_partition,memory_partition,"
header += "ip_blocks\n"
sysinfo.write(header)
@@ -341,6 +342,7 @@ def gen_sysinfo(workload_name, workload_dir, ip_blocks, app_cmd, skip_roof, roof
mspec.cur_sclk,
mspec.cur_mclk,
mspec.L2Banks,
mspec.totalL2Banks,
mspec.LDSBanks,
mspec.GPU,
mspec.numSQC,