From bfedf2629eae7abbcb7b9bd4841723b21f725636 Mon Sep 17 00:00:00 2001
From: Stephen Sachs
Date: Wed, 16 Jul 2025 17:56:12 +0200
Subject: [PATCH] Add issue templates and GitHub action to remove stale issues

We add 3 different issue types issue/question/RFE and add some predefined
questions to speed up the debugging process.

We also add a custom action which will close all issues created more than 6
months ago which have not been updated for more than a month.
---
 .github/ISSUE_TEMPLATE/ISSUE.yaml       |  77 +++++++++++++++++
 .github/ISSUE_TEMPLATE/QUESTION.yaml    |  16 ++++
 .github/ISSUE_TEMPLATE/RFE.yaml         |  22 +++++
 .github/ISSUE_TEMPLATE/config.yml       |   1 +
 .github/workflows/close-old-issues.js   | 118 +++++++++++++++++++++++++
 .github/workflows/close_old_issues.yaml |  32 +++++++
 6 files changed, 266 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/ISSUE.yaml
 create mode 100644 .github/ISSUE_TEMPLATE/QUESTION.yaml
 create mode 100644 .github/ISSUE_TEMPLATE/RFE.yaml
 create mode 100644 .github/ISSUE_TEMPLATE/config.yml
 create mode 100644 .github/workflows/close-old-issues.js
 create mode 100644 .github/workflows/close_old_issues.yaml

diff --git a/.github/ISSUE_TEMPLATE/ISSUE.yaml b/.github/ISSUE_TEMPLATE/ISSUE.yaml
new file mode 100644
index 0000000000..f760b305be
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/ISSUE.yaml
@@ -0,0 +1,77 @@
+name: NCCL issue or bug
+description: Report an issue or failure when running NCCL code
+title: "[Issue]: "
+labels: ["triage"]
+
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for reaching out! Before reporting a new issue, please feel free to search for the behavior in the existing issues. If you found an issue which is already closed or you are unsure, open a new issue and reference the old one from it.
+        You can also check out the [troubleshooting section](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html) in our user guide.
+
+        ---
+
+        To ensure we can assist you quickly and accurately, we often need the following information:
+  - type: dropdown
+    id: type
+    attributes:
+      label: How is this issue impacting you?
+      description: What best describes your issue?
+      options:
+        - Lower performance than expected
+        - Application crash
+        - Data corruption
+        - Application hang
+    validations:
+      required: true
+
+  - type: textarea
+    id: log
+    attributes:
+      label: Share Your Debug Logs
+      description: |
+
+        The logs and topo-files are a great tool to pin down issues. You can create them by setting these environment variables before the run.
+        * `NCCL_DEBUG=INFO` and `NCCL_DEBUG_FILE=ncclDebug.%h.%p` to produce one file per rank
+        * `NCCL_TOPO_DUMP_FILE=ncclSystem.txt`
+
+  - type: textarea
+    id: repro
+    attributes:
+      label: Steps to Reproduce the Issue
+      description: |
+        * **Minimal Steps**: Please provide a simple way to recreate the issue (see [Minimal Bug Reports](https://matthewrocklin.com/minimal-bug-reports) for inspiration).
+        * **Environment Details**: Include software versions and relevant settings.
+        * **Intermittency**: Is this a sporadic issue? If so, how often does it occur?
+        * **Previous Success**: Did this work with an older NCCL version?
+
+        The easier we can reproduce on our side the more likely we are to be able to solve it in a timely manner.
+
+  - type: input
+    id: nccl_version
+    attributes:
+      label: NCCL Version
+      description: |
+        NCCL reports its version string in the debug logs.
+        You can also determine the version if you know which library was used by running `strings libnccl.so | grep 'NCCL version'`.
+      placeholder: "e.g. 2.27.1+cuda12.8"
+    validations:
+      required: true
+
+  - type: textarea
+    id: platform
+    attributes:
+      label: Your platform details
+      description: |
+        * **GPU & Network**: Share your architecture and topology (e.g., from `nvidia-smi`, `nvidia-smi topo -m`, `ibstatus`).
+        * **Environment**: Bare-metal, containers, or cloud?
+        * **Scalability**: Does this issue occur with a specific number of ranks/nodes?
+
+  - type: textarea
+    id: issue-description
+    attributes:
+      label: Error Message & Behavior
+      description: |
+        * **First Error**: What was the initial `NCCL WARN` message in your logs?
+        * **Expected vs. Actual**: Briefly describe the anticipated behavior versus what you're seeing.
diff --git a/.github/ISSUE_TEMPLATE/QUESTION.yaml b/.github/ISSUE_TEMPLATE/QUESTION.yaml
new file mode 100644
index 0000000000..60e43489f1
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/QUESTION.yaml
@@ -0,0 +1,16 @@
+name: NCCL question
+description: Ask the NCCL team a question
+title: "[Question]: "
+labels: ["question"]
+
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for reaching out! To solve your problem, feel free to check out the [user guide](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/index.html), in particular the troubleshooting section, and also the [release notes](https://docs.nvidia.com/deeplearning/nccl/release-notes/index.html).
+
+        ---
+  - type: textarea
+    id: question
+    attributes:
+      label: Question
diff --git a/.github/ISSUE_TEMPLATE/RFE.yaml b/.github/ISSUE_TEMPLATE/RFE.yaml
new file mode 100644
index 0000000000..7a305abfa9
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/RFE.yaml
@@ -0,0 +1,22 @@
+name: NCCL request for enhancement
+description: Request for enhancement
+title: "[RFE]: "
+labels: ["enhancement"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+
+        Thanks for your feedback! Before reporting a new RFE you could quickly check if this already exists in our [existing requests](https://github.com/NVIDIA/nccl/issues?q=sort%3Aupdated-desc%20is%3Aissue%20is%3Aopen%20label%3Aenhancement).
+
+        ---
+  - type: textarea
+    id: rfe-description
+    attributes:
+      label: Please provide the below details to ensure we understand your needs
+      description: |
+        * What is the goal of this request?
+        * Who will benefit from this feature?
+        * Is this request for a specific GPU architecture or network infrastructure?
+        * How will this feature improve current workflows or processes?
+        * What is the priority level of this request?
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000..3ba13e0cec
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
+blank_issues_enabled: false
diff --git a/.github/workflows/close-old-issues.js b/.github/workflows/close-old-issues.js
new file mode 100644
index 0000000000..9605aa023a
--- /dev/null
+++ b/.github/workflows/close-old-issues.js
@@ -0,0 +1,118 @@
+// Closes stale issues: opened more than 6 months ago and not updated for more
+// than a month. Run daily by .github/workflows/close_old_issues.yaml.
+const { Octokit } = require("@octokit/rest");
+
+// Cap per run so that a large backlog is worked off gradually.
+const MAX_CLOSED_PER_RUN = 100;
+
+if (!process.env.REPO_OWNER || !process.env.REPO_NAME) {
+  throw new Error("REPO_OWNER and REPO_NAME must be set");
+}
+
+const octokit = new Octokit({ auth: process.env.GITHUB_TOKEN });
+
+const owner = process.env.REPO_OWNER;
+// REPO_NAME may be "repo" or "owner/repo"; keep only the last segment.
+const repo = process.env.REPO_NAME.split("/").pop();
+
+// Returns a copy of `from` moved back by `months` calendar months. The day is
+// clamped to the length of the target month, so Aug 31 minus 6 months yields
+// the last day of February instead of overflowing into March.
+function monthsAgo(from, months) {
+  const result = new Date(from);
+  const day = result.getDate();
+  result.setDate(1);
+  result.setMonth(result.getMonth() - months);
+  const lastDay = new Date(
+    result.getFullYear(),
+    result.getMonth() + 1,
+    0,
+  ).getDate();
+  result.setDate(Math.min(day, lastDay));
+  return result;
+}
+
+const now = new Date();
+const sixMonthsAgo = monthsAgo(now, 6);
+const oneMonthAgo = monthsAgo(now, 1);
+
+// Comment posted on every issue right before it is closed.
+const closingComment = `### Issue Cleanup: Helping Us Focus on Current Challenges
+
+We're [reviewing](https://github.com/NVIDIA/nccl/discussions/1761) older issues to ensure we prioritize the most relevant and active ones. Since this issue was opened more than 6 months ago and hasn't seen updates in over a month, we'll be closing it for now.
+
+*This change helps us focus our efforts on addressing any current issues our users are facing.* If this issue still affects you, please don't hesitate to reopen it with a quick update (e.g., "Still relevant on [version=X]").
+
+Thanks for your understanding and for contributing to NCCL.`;
+
+// Returns the numbers of up to `limit` open issues (pull requests excluded)
+// that were created before `sixMonthsAgo` and last updated before
+// `oneMonthAgo`, least recently updated first.
+async function findStaleIssues(limit) {
+  const stale = [];
+  for (let page = 1; ; page++) {
+    const { data: issues } = await octokit.issues.listForRepo({
+      owner,
+      repo,
+      state: "open",
+      sort: "updated",
+      direction: "asc",
+      per_page: 100,
+      page,
+    });
+    if (issues.length === 0) return stale;
+
+    for (const issue of issues) {
+      // Results are ordered by last update, oldest first, so once an entry is
+      // too recent nothing after it can be stale either.
+      if (new Date(issue.updated_at) >= oneMonthAgo) return stale;
+      // The issues endpoint also returns pull requests; ignore them.
+      if (issue.pull_request) continue;
+      if (new Date(issue.created_at) >= sixMonthsAgo) continue;
+
+      stale.push(issue.number);
+      if (stale.length >= limit) return stale;
+    }
+  }
+}
+
+// Comments on and closes every stale issue, at most MAX_CLOSED_PER_RUN per
+// run. Rejects if any GitHub API call fails; issues closed before the failure
+// stay closed.
+async function closeOldIssues() {
+  // Select all candidates before closing anything: closing an issue removes it
+  // from the "open" listing, which would shift later pages and skip issues.
+  const staleNumbers = await findStaleIssues(MAX_CLOSED_PER_RUN);
+
+  let closedCount = 0;
+  for (const issue_number of staleNumbers) {
+    // Add a comment before closing
+    await octokit.issues.createComment({
+      owner,
+      repo,
+      issue_number,
+      body: closingComment,
+    });
+
+    await octokit.issues.update({
+      owner,
+      repo,
+      issue_number,
+      state: "closed",
+      state_reason: "not_planned",
+    });
+    closedCount++;
+    console.log(`Closed issue #${issue_number}`);
+  }
+
+  if (closedCount >= MAX_CLOSED_PER_RUN) {
+    console.log(`Closed ${MAX_CLOSED_PER_RUN} issues, stopping.`);
+  }
+  console.log(`Total closed: ${closedCount}`);
+}
+
+// Report failures through the exit code so the workflow run is marked failed.
+closeOldIssues().catch((err) => {
+  console.error(err);
+  process.exitCode = 1;
+});
diff --git a/.github/workflows/close_old_issues.yaml b/.github/workflows/close_old_issues.yaml
new file mode 100644
index 0000000000..15d81cb547
--- /dev/null
+++ b/.github/workflows/close_old_issues.yaml
@@ -0,0 +1,32 @@
+name: Close Old Issues
+
+on:
+  schedule:
+    - cron: '30 2 * * *' # Runs daily at 02:30 UTC
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  issues: write
+
+jobs:
+  close-old-issues:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Install dependencies
+        run: npm install @octokit/rest@22.0.0
+
+      - name: Run close-old-issues script
+        run: node .github/workflows/close-old-issues.js
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          REPO_OWNER: ${{ github.repository_owner }}
+          REPO_NAME: ${{ github.event.repository.name || github.repository }}