Add the ability to determine GPU model from Chip ID (#423)

* Add the ability to determine GPU model from Chip ID for distinguishing MI300 systems by using a built-in dictionary. Signed-off-by: xuchen-amd <xuchen@amd.com> * Add support for MI300X_A1 Signed-off-by: xuchen-amd <xuchen@amd.com> * Remove MI308X identification using num CUs, and format Python using black. Signed-off-by: xuchen-amd <xuchen@amd.com> * Add Read the Docs Signed-off-by: xuchen-amd <xuchen@amd.com> * Add sphinx requirement Signed-off-by: xuchen-amd <xuchen@amd.com> * Remove gpu_model identification using gpu_arch Signed-off-by: xuchen-amd <xuchen@amd.com> * Remove OMNIPERF_ARCH_OVERRIDE and its usage. Determining MI300 gpu model solely based on chip id. Signed-off-by: xuchen-amd <xuchen@amd.com> * Fix Python formatting using black. Signed-off-by: xuchen-amd <xuchen@amd.com> --------- Signed-off-by: xuchen-amd <xuchen@amd.com>
2024-09-24 16:53:36 -04:00
@@ -55,6 +55,12 @@ SUPPORTED_ARCHS = {
    "gfx942": {"mi300": ["MI300A_A1", "MI300X_A1"]},
 }

+MI300_CHIP_IDS = {  # Chip ID (decimal string) -> GPU model name
+    "29856": "MI300A_A1",  # 0x74a0
+    "29857": "MI300X_A1",  # 0x74a1
+    "29858": "MI308X",  # 0x74a2
+}
+

 class Omniperf:
    def __init__(self):
@@ -34,6 +34,7 @@ from pathlib import Path
 from collections import OrderedDict

 from omniperf_base import SUPPORTED_ARCHS
+from omniperf_base import MI300_CHIP_IDS


 class OmniSoC_Base:
@@ -100,11 +101,6 @@ class OmniSoC_Base:
        # assume no SoC specific options and return empty list by default
        return []

-    def check_arch_override(self):
-        if "OMNIPERF_ARCH_OVERRIDE" in os.environ.keys():
-            return os.environ["OMNIPERF_ARCH_OVERRIDE"]
-        return ""
-
    @demarcate
    def populate_mspec(self):
        from utils.specs import search, run, total_sqc, total_xcds
@@ -156,6 +152,11 @@ class OmniSoC_Base:
                self._mspec.workgroup_max_size = key
                continue

+            key = search(r"^\s*Chip ID:\s+ ([a-zA-Z0-9]+)\s*", linetext)
+            if key != None:
+                self._mspec.chip_id = key
+                continue
+
            key = search(r"^\s*Max Waves Per CU:\s+ ([a-zA-Z0-9]+)\s*", linetext)
            if key != None:
                self._mspec.max_waves_per_cu = key
@@ -181,28 +182,9 @@ class OmniSoC_Base:
            0
        ].upper()
        if self._mspec.gpu_model == "MI300":
-            self._mspec.gpu_model = list(SUPPORTED_ARCHS[self._mspec.gpu_arch].values())[
-                0
-            ][0]
-        if self._mspec.gpu_arch == "gfx942":
-            if (
-                "MI300A" in "\n".join(self._mspec._rocminfo)
-                or "MI300A" in self.check_arch_override()
-            ):
-                self._mspec.gpu_model = "MI300A_A1"
-            elif (
-                "MI300X" in "\n".join(self._mspec._rocminfo)
-                or "MI300X" in self.check_arch_override()
-            ):
-                self._mspec.gpu_model = "MI300X_A1"
-            # We need to distinguish MI308X by peeking reported num CUs
-            elif self._mspec.cu_per_gpu == "80" or "MI308X" in self.check_arch_override():
-                self._mspec.gpu_model = "MI308X"
-            else:
-                console_error(
-                    "Cannot parse MI300 details from rocminfo. Please verify output or set the arch using (e.g.,) "
-                    'export OMNIPERF_ARCH_OVERRIDE="MI300A"'
-                )
+            # Map the Chip ID to a specific MI300 model; unlisted IDs leave gpu_model as-is
+            if self._mspec.chip_id in MI300_CHIP_IDS:
+                self._mspec.gpu_model = MI300_CHIP_IDS[self._mspec.chip_id]

        self._mspec.num_xcd = str(
            total_xcds(self._mspec.gpu_model, self._mspec.compute_partition)
@@ -403,6 +403,13 @@ class MachineSpecs:
            "name": "Workgroup Max Size",
        },
    )
+    chip_id: str = field(  # value parsed from the "Chip ID:" line; None if absent
+        default=None,
+        metadata={
+            "doc": "<>",
+            "name": "Chip ID",
+        },
+    )
    max_waves_per_cu: str = field(
        default=None,
        metadata={