From 7c7705d2baaec54027f36490ca4578fec9e2bdf4 Mon Sep 17 00:00:00 2001 From: David Galiffi Date: Mon, 12 Aug 2024 10:32:33 -0400 Subject: [PATCH] Remove `dev` and `main` branch from workflows. (#404) * Remove `dev` and `main` branch from workflows. Update links in documentation. Signed-off-by: David Galiffi * `amd-staging` -> `amd-mainline` in docs Signed-off-by: Peter Jun Park --------- Signed-off-by: David Galiffi Signed-off-by: Peter Jun Park Co-authored-by: Peter Jun Park --- .github/workflows/docs.yml | 2 +- .github/workflows/formatting.yml | 4 ++-- .github/workflows/mi-rhel9.yml | 2 +- .github/workflows/rhel-8.yml | 4 ++-- .github/workflows/tarball.yml | 2 +- .github/workflows/ubuntu-jammy.yml | 4 ++-- docs/archive/docs-1.x/analysis.md | 2 +- docs/archive/docs-2.x/analysis.md | 2 +- docs/archive/docs-2.x/performance_model.md | 16 ++++++++-------- docs/conf.py | 2 +- docs/how-to/analyze/cli.rst | 2 +- docs/how-to/profile/mode.rst | 2 +- .../vector-memory-operation-counting.rst | 2 +- docs/tutorial/profiling-by-example.rst | 2 +- 14 files changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 81a46cf8b0..f6dcbd9a04 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -2,7 +2,7 @@ name: Documentation on: push: - branches: [ main, amd-mainline ] + branches: [ amd-mainline ] paths: - 'docs/archive/docs-2.x/**' - 'docs/archive/docs-1.x/**' diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml index 8f4bda4453..9d9e125f04 100644 --- a/.github/workflows/formatting.yml +++ b/.github/workflows/formatting.yml @@ -3,9 +3,9 @@ name: Formatting on: push: - branches: [ main, dev, amd-mainline, amd-staging ] + branches: [ amd-mainline, amd-staging ] pull_request: - branches: [ main, dev, amd-mainline, amd-staging ] + branches: [ amd-mainline, amd-staging ] concurrency: group: ${{ github.workflow }}-${{ github.ref }} diff --git a/.github/workflows/mi-rhel9.yml b/.github/workflows/mi-rhel9.yml index a500d2bc02..157241521f 100644 --- a/.github/workflows/mi-rhel9.yml +++ b/.github/workflows/mi-rhel9.yml @@ -2,7 +2,7 @@ name: mi-rhel9 on: push: - branches: [ main, amd-mainline ] + branches: [ amd-mainline ] # Allows manual execution workflow_dispatch: diff --git a/.github/workflows/rhel-8.yml b/.github/workflows/rhel-8.yml index 278fbaa5f2..59ba244186 100644 --- a/.github/workflows/rhel-8.yml +++ b/.github/workflows/rhel-8.yml @@ -5,9 +5,9 @@ name: RHEL 8 # Controls when the workflow will run on: push: - branches: [ main, dev, amd-mainline, amd-staging ] + branches: [ amd-mainline, amd-staging ] pull_request: - branches: [ main, dev, amd-mainline, amd-staging ] + branches: [ amd-mainline, amd-staging ] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: diff --git a/.github/workflows/tarball.yml b/.github/workflows/tarball.yml index 8958c076d7..1724a01f96 100644 --- a/.github/workflows/tarball.yml +++ b/.github/workflows/tarball.yml @@ -2,7 +2,7 @@ name: tarball on: push: - branches: [ main, amd-mainline ] + branches: [ amd-mainline ] pull_request: concurrency: diff --git a/.github/workflows/ubuntu-jammy.yml b/.github/workflows/ubuntu-jammy.yml index cb76dd4977..cf29d809d5 100644 --- a/.github/workflows/ubuntu-jammy.yml +++ b/.github/workflows/ubuntu-jammy.yml @@ -4,9 +4,9 @@ name: Ubuntu 22.04 on: push: - branches: [ main, dev, amd-mainline, amd-staging ] + branches: [ amd-mainline, amd-staging ] pull_request: - branches: [ main, dev, amd-mainline, amd-staging ] + branches: [ amd-mainline, amd-staging ] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: diff --git a/docs/archive/docs-1.x/analysis.md b/docs/archive/docs-1.x/analysis.md index e8e18fe906..126364dbd6 100644 --- a/docs/archive/docs-1.x/analysis.md +++ b/docs/archive/docs-1.x/analysis.md @@ -171,7 +171,7 @@ $ omniperf analyze -p workloads/vcopy/mi200/ --list-metrics gfx90a ├─────────┼─────────────────────────────┤ ... ``` - 2. Choose your own customized subset of metrics with `-b` (a.k.a. `--metric`), or build your own config following [config_template](https://github.com/ROCm/omniperf/blob/main/src/omniperf_analyze/configs/panel_config_template.yaml). Below shows how to generate a report containing only metric 2 (a.k.a. System Speed-of-Light). + 2. Choose your own customized subset of metrics with `-b` (a.k.a. `--metric`), or build your own config following [config_template](https://github.com/ROCm/omniperf/blob/amd-mainline/src/omniperf_analyze/configs/panel_config_template.yaml). Below shows how to generate a report containing only metric 2 (a.k.a. System Speed-of-Light). ```shell-session $ omniperf analyze -p workloads/vcopy/mi200/ -b 2 -------- diff --git a/docs/archive/docs-2.x/analysis.md b/docs/archive/docs-2.x/analysis.md index 471fc31725..22fa72e392 100644 --- a/docs/archive/docs-2.x/analysis.md +++ b/docs/archive/docs-2.x/analysis.md @@ -181,7 +181,7 @@ Analysis mode = cli 2.1.30 -> L1I Fetch Latency ... ``` -3. Choose your own customized subset of metrics with `-b` (a.k.a. `--block`), or build your own config following [config_template](https://github.com/ROCm/omniperf/blob/main/src/omniperf_analyze/configs/panel_config_template.yaml). Below shows how to generate a report containing only metric 2 (a.k.a. System Speed-of-Light). +3. Choose your own customized subset of metrics with `-b` (a.k.a. `--block`), or build your own config following [config_template](https://github.com/ROCm/omniperf/blob/amd-mainline/src/omniperf_analyze/configs/panel_config_template.yaml). Below shows how to generate a report containing only metric 2 (a.k.a. System Speed-of-Light). ```shell-session $ omniperf analyze -p workloads/vcopy/MI200/ -b 2 -------- diff --git a/docs/archive/docs-2.x/performance_model.md b/docs/archive/docs-2.x/performance_model.md index 1f564084fc..df761cf0df 100644 --- a/docs/archive/docs-2.x/performance_model.md +++ b/docs/archive/docs-2.x/performance_model.md @@ -2178,7 +2178,7 @@ A good discussion of coarse and fine grained memory allocations and what type of (VALU_inst_mix_example)= ## VALU Arithmetic Instruction Mix -For this example, we consider the [instruction mix sample](https://github.com/ROCm/omniperf/blob/dev/sample/instmix.hip) distributed as a part of Omniperf. +For this example, we consider the [instruction mix sample](https://github.com/ROCm/omniperf/blob/amd-mainline/sample/instmix.hip) distributed as a part of Omniperf. ```{note} This example is expected to work on all CDNA accelerators, however the results in this section were collected on an [MI2XX](2xxnote) accelerator @@ -2269,7 +2269,7 @@ shows that we have exactly one of each type of VALU arithmetic instruction, by c (Fabric_transactions_example)= ## Infinity-Fabric(tm) transactions -For this example, we consider the [Infinity Fabric(tm) sample](https://github.com/ROCm/omniperf/blob/dev/sample/fabric.hip) distributed as a part of Omniperf. +For this example, we consider the [Infinity Fabric(tm) sample](https://github.com/ROCm/omniperf/blob/amd-mainline/sample/fabric.hip) distributed as a part of Omniperf. This code launches a simple read-only kernel, e.g.: ```c++ @@ -2826,7 +2826,7 @@ On an AMD [MI2XX](2xxnote) accelerator, for FP32 values this will generate a `gl (flatmembench)= ### Global / Generic (FLAT) -For this example, we consider the [vector-memory sample](https://github.com/ROCm/omniperf/blob/dev/sample/vmem.hip) distributed as a part of Omniperf. +For this example, we consider the [vector-memory sample](https://github.com/ROCm/omniperf/blob/amd-mainline/sample/vmem.hip) distributed as a part of Omniperf. This code launches many different versions of a simple read/write/atomic-only kernels targeting various address spaces, e.g. below is our simple `global_write` kernel: ```c++ @@ -2976,7 +2976,7 @@ The assembly in these experiments were generated for an [MI2XX](2xxnote) acceler Next, we examine a generic write. As discussed [previously](Flat_design), our `generic_write` kernel uses an address space cast to _force_ the compiler to choose our desired address space, regardless of other optimizations that may be possible. -We also note that the `filter` parameter passed in as a kernel argument (see [example](https://github.com/ROCm/omniperf/blob/dev/sample/vmem.hip), or [design note](Flat_design)) is set to zero on the host, such that we always write to the 'local' (LDS) memory allocation `lds`. +We also note that the `filter` parameter passed in as a kernel argument (see [example](https://github.com/ROCm/omniperf/blob/amd-mainline/sample/vmem.hip), or [design note](Flat_design)) is set to zero on the host, such that we always write to the 'local' (LDS) memory allocation `lds`. Examining this kernel in the VMEM Instruction Mix table yields: @@ -3339,7 +3339,7 @@ Next we examine the use of 'Spill/Scratch' memory. On current CDNA accelerators such as the [MI2XX](2xxnote), this is implemented using the [private](mspace) memory space, which maps to ['scratch' memory](https://llvm.org/docs/AMDGPUUsage.html#amdgpu-address-spaces) in AMDGPU hardware terminology. This type of memory can be accessed via different instructions depending on the specific architecture targeted. However, current CDNA accelerators such as the [MI2XX](2xxnote) use so called `buffer` instructions to access private memory in a simple (and typically) coalesced manner. See [Sec. 9.1, 'Vector Memory Buffer Instructions' of the CDNA2 ISA guide](https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf) for further reading on this instruction type. -We develop a [simple kernel](https://github.com/ROCm/omniperf/blob/dev/sample/stack.hip) that uses stack memory: +We develop a [simple kernel](https://github.com/ROCm/omniperf/blob/amd-mainline/sample/stack.hip) that uses stack memory: ```c++ #include __global__ void knl(int* out, int filter) { @@ -3404,7 +3404,7 @@ Here we see a single write to the stack (10.3.6), which corresponds to an L1-L2 (IPC_example)= ## Instructions-per-cycle and Utilizations example -For this section, we use the instructions-per-cycle (IPC) [example](https://github.com/ROCm/omniperf/blob/dev/sample/ipc.hip) included with Omniperf. +For this section, we use the instructions-per-cycle (IPC) [example](https://github.com/ROCm/omniperf/blob/amd-mainline/sample/ipc.hip) included with Omniperf. This example is compiled using `c++17` support: @@ -3824,7 +3824,7 @@ Finally, we note that our branch utilization (11.2.5) has increased slightly fro ## LDS Examples -For this example, we consider the [LDS sample](https://github.com/ROCm/omniperf/blob/dev/sample/lds.hip) distributed as a part of Omniperf. +For this example, we consider the [LDS sample](https://github.com/ROCm/omniperf/blob/amd-mainline/sample/lds.hip) distributed as a part of Omniperf. This code contains two kernels to explore how both [LDS](lds) bandwidth and bank conflicts are calculated in Omniperf. This example was compiled and run on an MI250 accelerator using ROCm v5.6.0, and Omniperf v2.0.0. @@ -4037,7 +4037,7 @@ The bank conflict rate linearly increases with the number of work-items within a ## Occupancy Limiters Example -In this [example](https://github.com/ROCm/omniperf/blob/dev/sample/occupancy.hip), we will investigate the use of the resource allocation panel in the [Workgroup Manager](SPI)'s metrics section to determine occupancy limiters. +In this [example](https://github.com/ROCm/omniperf/blob/amd-mainline/sample/occupancy.hip), we will investigate the use of the resource allocation panel in the [Workgroup Manager](SPI)'s metrics section to determine occupancy limiters. This code contains several kernels to explore how both various kernel resources impact achieved occupancy, and how this is reported in Omniperf. This example was compiled and run on a MI250 accelerator using ROCm v5.6.0, and Omniperf v2.0.0: diff --git a/docs/conf.py b/docs/conf.py index b38ce2e5cf..f74f95ecd2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -55,7 +55,7 @@ external_projects_current_project = "omniperf" # frequently used external resources extlinks = { - "dev-sample": ("https://github.com/ROCm/omniperf/blob/dev/sample/%s", "%s"), + "dev-sample": ("https://github.com/ROCm/omniperf/blob/amd-mainline/sample/%s", "%s"), "prod-page": ( "https://www.amd.com/en/products/accelerators/instinct/%s.html", "%s", diff --git a/docs/how-to/analyze/cli.rst b/docs/how-to/analyze/cli.rst index 15faff8fc1..82185cd417 100644 --- a/docs/how-to/analyze/cli.rst +++ b/docs/how-to/analyze/cli.rst @@ -186,7 +186,7 @@ Walkthrough 3. Choose your own customized subset of metrics with the ``-b`` (or ``--block``) option. Or, build your own configuration following - `config_template `_. + `config_template `_. The following snippet shows how to generate a report containing only metric 2 (:doc:`System Speed-of-Light `). diff --git a/docs/how-to/profile/mode.rst b/docs/how-to/profile/mode.rst index de23a801ba..5bc0ad6a7c 100644 --- a/docs/how-to/profile/mode.rst +++ b/docs/how-to/profile/mode.rst @@ -38,7 +38,7 @@ Run ``omniperf profile -h`` for more details. See Profiling example ----------------- -The ``__ repository +The ``__ repository includes source code for a sample GPU compute workload, ``vcopy.cpp``. A copy of this file is available in the ``share/sample`` subdirectory after a normal Omniperf installation, or via the ``$OMNIPERF_SHARE/sample`` directory when diff --git a/docs/tutorial/includes/vector-memory-operation-counting.rst b/docs/tutorial/includes/vector-memory-operation-counting.rst index e3dd0deb4a..2797ed8f26 100644 --- a/docs/tutorial/includes/vector-memory-operation-counting.rst +++ b/docs/tutorial/includes/vector-memory-operation-counting.rst @@ -623,7 +623,7 @@ manner. See for further reading on this instruction type. We develop a `simple -kernel `__ +kernel `__ that uses stack memory: .. code-block:: cpp diff --git a/docs/tutorial/profiling-by-example.rst b/docs/tutorial/profiling-by-example.rst index 8a9c85c03b..e39239b9d1 100644 --- a/docs/tutorial/profiling-by-example.rst +++ b/docs/tutorial/profiling-by-example.rst @@ -7,7 +7,7 @@ Profiling by example ******************** The following examples refer to sample :doc:`HIP ` code located in -:fab:`github` :dev-sample:`ROCm/omniperf/blob/dev/sample <>` and distributed +:fab:`github` :dev-sample:`ROCm/omniperf/blob/amd-mainline/sample <>` and distributed as part of Omniperf. .. include:: ./includes/valu-arithmetic-instruction-mix.rst