Add the ability to determine GPU model from Chip ID (#423)

* Add the ability to determine GPU model from Chip ID for distinguishing MI300 systems by using a built-in dictionary.

Signed-off-by: xuchen-amd <xuchen@amd.com>

* Add support for MI300X_A1

Signed-off-by: xuchen-amd <xuchen@amd.com>

* Remove MI308X identification using num CUs, and format Python using black.

Signed-off-by: xuchen-amd <xuchen@amd.com>

* Add Read the Docs

Signed-off-by: xuchen-amd <xuchen@amd.com>

* Add sphinx requirement

Signed-off-by: xuchen-amd <xuchen@amd.com>

* Remove gpu_model identification using gpu_arch

Signed-off-by: xuchen-amd <xuchen@amd.com>

* Remove OMNIPERF_ARCH_OVERRIDE and its usage. Determine the MI300 GPU model solely from the Chip ID.

Signed-off-by: xuchen-amd <xuchen@amd.com>

* Fix Python formatting using black.

Signed-off-by: xuchen-amd <xuchen@amd.com>

---------

Signed-off-by: xuchen-amd <xuchen@amd.com>
이 커밋은 다음에 포함됨:
xuchen-amd
2024-09-24 16:53:36 -04:00
커밋한 사람 Cole Ramos
부모 99680a7833
커밋 c9773c157e
3개의 변경된 파일, 22개의 추가 그리고 27개의 삭제
+6
파일 보기
@@ -55,6 +55,12 @@ SUPPORTED_ARCHS = {
"gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
}
# Lookup table from Chip ID to MI300-series GPU model name.
# Keys are the Chip ID written as a decimal string (not an int), so a lookup
# only succeeds when the queried Chip ID is in the same textual form.
# NOTE(review): presumably these match the "Chip ID:" value printed by
# rocminfo -- confirm against real output for each listed model.
MI300_CHIP_IDS = {
"29856": "MI300A_A1",  # 29856 == 0x74A0
"29857": "MI300X_A1",  # 29857 == 0x74A1
"29858": "MI308X",  # 29858 == 0x74A2
}
class Omniperf:
def __init__(self):
+9 -27
파일 보기
@@ -34,6 +34,7 @@ from pathlib import Path
from collections import OrderedDict
from omniperf_base import SUPPORTED_ARCHS
from omniperf_base import MI300_CHIP_IDS
class OmniSoC_Base:
@@ -100,11 +101,6 @@ class OmniSoC_Base:
# assume no SoC specific options and return empty list by default
return []
def check_arch_override(self) -> str:
    """Return the user-supplied architecture override, if any.

    Reads the ``OMNIPERF_ARCH_OVERRIDE`` environment variable.

    Returns:
        The variable's value, or an empty string when it is not set, so
        callers can safely run substring checks on the result.
    """
    # A single .get() replaces the membership test plus second lookup.
    return os.environ.get("OMNIPERF_ARCH_OVERRIDE", "")
@demarcate
def populate_mspec(self):
from utils.specs import search, run, total_sqc, total_xcds
@@ -156,6 +152,11 @@ class OmniSoC_Base:
self._mspec.workgroup_max_size = key
continue
key = search(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", linetext)
if key != None:
self._mspec.chip_id = key
continue
key = search(r"^\s*Max Waves Per CU:\s+ ([a-zA-Z0-9]+)\s*", linetext)
if key != None:
self._mspec.max_waves_per_cu = key
@@ -181,28 +182,9 @@ class OmniSoC_Base:
0
].upper()
if self._mspec.gpu_model == "MI300":
self._mspec.gpu_model = list(SUPPORTED_ARCHS[self._mspec.gpu_arch].values())[
0
][0]
if self._mspec.gpu_arch == "gfx942":
if (
"MI300A" in "\n".join(self._mspec._rocminfo)
or "MI300A" in self.check_arch_override()
):
self._mspec.gpu_model = "MI300A_A1"
elif (
"MI300X" in "\n".join(self._mspec._rocminfo)
or "MI300X" in self.check_arch_override()
):
self._mspec.gpu_model = "MI300X_A1"
# We need to distinguish MI308X by peeking reported num CUs
elif self._mspec.cu_per_gpu == "80" or "MI308X" in self.check_arch_override():
self._mspec.gpu_model = "MI308X"
else:
console_error(
"Cannot parse MI300 details from rocminfo. Please verify output or set the arch using (e.g.,) "
'export OMNIPERF_ARCH_OVERRIDE="MI300A"'
)
# Use Chip ID to distinguish MI300 gpu model using the built-in dictionary.
# The dictionary value is a model name, so it must be written to gpu_model:
# storing it in chip_id would overwrite the parsed Chip ID and leave
# gpu_model unrefined for the total_xcds() call that follows.
if self._mspec.chip_id in MI300_CHIP_IDS:
    self._mspec.gpu_model = MI300_CHIP_IDS[self._mspec.chip_id]
self._mspec.num_xcd = str(
total_xcds(self._mspec.gpu_model, self._mspec.compute_partition)
+7
파일 보기
@@ -403,6 +403,13 @@ class MachineSpecs:
"name": "Workgroup Max Size",
},
)
# Chip ID of the device, kept as a string; defaults to None until populated.
# NOTE(review): the "doc" metadata below is still the placeholder "<>" --
# presumably it should describe the field like its siblings; confirm and fill in.
chip_id: str = field(
default=None,
metadata={
"doc": "<>",
"name": "Chip ID",
},
)
max_waves_per_cu: str = field(
default=None,
metadata={