Enhance correct_sys_info() func and err checking

Signed-off-by: colramos-amd <colramos@amd.com>


[ROCm/rocprofiler-compute commit: c17e39189f]
This commit is contained in:
colramos-amd
2024-02-27 12:15:37 -06:00
gecommit door Cole Ramos
bovenliggende 98ecb3b590
commit 78b668f128
4 gewijzigde bestanden met toevoegingen van 35 en 65 verwijderingen
@@ -41,8 +41,8 @@ class OmniAnalyze_Base:
self._runs = OrderedDict()
self._arch_configs = {}
self.__supported_archs = supported_archs
self._output = None
self.__socs = None # available OmniSoC objs
self._output = None
self.__socs:dict = None # available OmniSoC objs
def get_args(self):
# Accessor: hands back the object held in this instance's private __args attribute.
return self.__args
@@ -150,12 +150,11 @@ class OmniAnalyze_Base:
for d in self.__args.path:
w = schema.Workload()
w.sys_info = file_io.load_sys_info(Path(d[0], "sysinfo.csv"))
if self.__args.specs_correction:
w.sys_info = parser.correct_sys_info(
w.sys_info, self.__args.specs_correction
)
w.avail_ips = w.sys_info["ip_blocks"].item().split("|")
arch = w.sys_info.iloc[0]["gpu_arch"]
mspec = self.get_socs()[arch]._mspec
if self.__args.specs_correction:
w.sys_info = parser.correct_sys_info(mspec, self.__args.specs_correction)
w.avail_ips = w.sys_info["ip_blocks"].item().split("|")
w.dfs = copy.deepcopy(self._arch_configs[arch].dfs)
w.dfs_type = self._arch_configs[arch].dfs_type
self._runs[d[0]] = w
@@ -156,7 +156,7 @@ class Omniperf:
return
@demarcate
def load_soc_specs(self, sysinfo=None):
def load_soc_specs(self, sysinfo:dict=None):
"""Load OmniSoC instance for Omniperf run
"""
self.__mspec = MachineSpecs(self.__args, sysinfo)
@@ -278,6 +278,7 @@ class Omniperf:
# Load required SoC(s) from input
for d in analyzer.get_args().path:
sys_info = pd.read_csv(Path(d[0], "sysinfo.csv"))
sys_info = sys_info.to_dict('list')
self.load_soc_specs(sys_info)
analyzer.set_soc(self.__soc)
@@ -912,9 +912,12 @@ def load_kernel_top(workload, dir):
if file.exists():
tmp[id] = pd.read_csv(file)
else:
logging.info(
"Warning: Issue loading top kernels. Check pmc_kernel_top.csv"
)
logging.info("Warning: Issue loading top kernels. Check pmc_kernel_top.csv")
# NB: Special case for sysinfo. Probably room for improvement in this whole function design
elif "from_csv_columnwise" in df.columns and id == 101:
tmp[id] = workload.sys_info.transpose()
# All transposed columns should be marked with a general header
tmp[id].columns = ["Info"]
elif "from_csv_columnwise" in df.columns:
# NB:
# Another way might be doing transpose in tty like metric_table.
@@ -962,60 +965,18 @@ def build_comparable_columns(time_unit):
return comparable_columns
def correct_sys_info(mspec, specs_correction: str):
    """Manually override machine-spec attributes and return the updated specs.

    Every ``name: <digits>`` pair found in *specs_correction* is written onto
    *mspec* with ``setattr``. Only names that already exist as attributes of
    *mspec* are considered valid; anything else is reported through
    ``error()``.

    Args:
        mspec: Machine-spec object to patch in place. It must expose the
            correctable specs as attributes and provide
            ``get_class_members()``.
        specs_correction: Text holding one or more ``name: value`` pairs,
            e.g. ``"numCU: 110, sclk:2100"``. Values must be unsigned
            integers; the separator between pairs is not significant.

    Returns:
        Whatever ``mspec.get_class_members()`` returns after the overrides
        have been applied.
    """
    # The regex only captures digit runs, so each value is stored as the
    # matched string (no numeric conversion is performed here). Building a
    # dict first means a name given twice is applied once, last value wins.
    corrections = dict(re.findall(r"(\w+):\s*(\d+)", specs_correction))
    for spec_name, spec_value in corrections.items():
        if not hasattr(mspec, spec_name):
            # NOTE(review): error() is defined elsewhere in the project and
            # presumably aborts the run -- confirm. If it returns, the
            # assignment below still happens, as in the original code.
            error(
                f"Invalid specs correction '{spec_name}'. "
                "Please use --specs option to peek valid specs"
            )
        setattr(mspec, spec_name, spec_value)
    return mspec.get_class_members()
@@ -53,9 +53,10 @@ VERSION_LOC = [
@dataclass
class MachineSpecs:
def __init__(self, args, sysinfo=None):
def __init__(self, args, sysinfo:dict=None):
if not sysinfo is None:
self.gpu_arch = sysinfo.iloc[0]["gpu_arch"]
for key, value in sysinfo.items():
setattr(self, key, value[0])
return
# read timestamp info
now = datetime.now()
@@ -254,8 +255,16 @@ def get_rocm_ver():
error("Unable to detect a complete local ROCm installation.\nThe expected %s/.info/ versioning directory is missing. Please ensure you have valid ROCm installation." % _rocm_path)
return rocm_ver
<<<<<<< HEAD
def run(cmd, exit_on_error=False):
p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
=======
def run(cmd,exit_on_error=False):
try:
p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except FileNotFoundError as e:
error(f"Unable to parse specs. Can't find ROCm asset: {e.filename}\nTry passing a path to an existing workload results in 'analyze' mode.")
>>>>>>> 2d92bcf (Enhance correct_sys_info() func and err checking)
if exit_on_error:
if cmd[0] == "rocm-smi":