diff --git a/projects/rocm-core/rdhc/README.md b/projects/rocm-core/rdhc/README.md index 539e3a1351..e6a57ab54c 100644 --- a/projects/rocm-core/rdhc/README.md +++ b/projects/rocm-core/rdhc/README.md @@ -1,28 +1,64 @@ -# rdhc -Rocm Deployment Health Check Tool +# ROCm Deployment Health Check (RDHC) + +## Overview + +RDHC is a comprehensive health check tool for ROCm deployments. It validates GPU presence, driver status, kernel parameters, library dependencies, and tests installed ROCm components. -## Features of the ROCm Deployment Health Check Tool +## Features + +- **Cross-Platform Support**: Works on Ubuntu, RHEL, and SLES distributions +- **Comprehensive Testing**: GPU validation, driver checks, library dependencies, kernel parameters, and component-specific tests +- **Dynamic Component Detection**: Automatically identifies installed ROCm components +- **Flexible Reporting**: Pretty table output and JSON export options +- **Configurable Verbosity**: Support for verbose, normal, and silent modes + + +## Test Categories + +### Default Tests (Quick Mode) + +1. **GPU Presence** - Detects AMD GPUs in the system +2. **AMDGPU Driver** - Validates driver installation and initialization +3. **Kernel Parameters** - Checks ROCm-related kernel settings +4. **rocminfo** - Validates ROCm information utility +5. **rocm_agent_enumerator** - Checks GPU agent enumeration +6. **amd-smi** - Tests AMD System Management Interface +7. **Library Dependencies** - Validates ROCm library dependencies +8. **Environment Variables** - Checks ROCm-related environment settings +9. **Multinode Cluster Readiness** - Validates network and MPI configuration +10. **Atomic Operations** - Checks if atomic operations are enabled on GPUs + +### Component Tests (--all mode) + +Tests installed ROCm components by compiling and executing example programs: + +- HIP (hipcc, hip-runtime-amd) +- Math Libraries (hipBLAS, hipFFT, rocBLAS, rocFFT, etc.) +- Primitives (hipCUB, rocPRIM, rocThrust) +- Solvers (hipSOLVER, rocSOLVER, rocSPARSE) +- Deep Learning (MIOpen) +- Applications (from rocm-examples repository) + +## Output + +The tool provides three types of output: + +1. **Console Output** - Real-time test progress and results +2. **Summary Tables** - Formatted tables showing: + - General system information + - GPU device information + - Firmware version information + - Test results with status and details +3. **JSON Export** - Detailed results in JSON format for further analysis -1. **Cross-Platform Support**: Works on Ubuntu, RHEL, and SLES distributions -2. **Comprehensive Testing**: - - Default tests (GPU presence, driver status, rocminfo, rocm-smi) - - Library dependency verification - - Check some kernel parameters and ENV variables presence - - Component-specific tests - - Build and test the test program available from rocm-examples git repo dynamically. -3. **Dynamic Component Detection**: Identifies installed ROCm components using distribution-specific package manager commands -4. **Flexible Reporting**: - - Pretty table output for terminal display - - JSON export for further analysis or integration -5. **Configurable Verbosity**: Through command-line options (`-v` for verbose, `-s` for silent) ## Install dependency pip packages ```bash sudo pip3 install -r requirements.txt - ``` + ## Usage ```bash @@ -61,12 +97,12 @@ sudo -E ./rdhc.py --all --json rdhc-results.json # Specify a directory for temp files and logs (default: /tmp/rdhc/) sudo -E ./rdhc.py -d /home/user/rdhc-dir/ - ``` + ## RDHC Environment VARIABLES -RDHC tool will use the following ENV varaibles and act accordingly if they are set. +RDHC tool will use the following ENV variables and act accordingly if they are set. ```bash -# ROCm installation path can be set by the below ENV varaible. Default is "/opt/rocm/" +# ROCm installation path can be set by the below ENV variable. Default is "/opt/rocm/" export ROCM_PATH="/opt/rocm" # For library dependency validation, the lib search depth can be set by the below ENV. @@ -75,7 +111,41 @@ export LIBDIR_MAX_DEPTH="" # if you want to check the libs only from the ROCM_PATH/lib/ folder set the depth as 1. export LIBDIR_MAX_DEPTH=1 - ``` -The tool is designed to be easily extended with additional component tests by -adding new test methods following the naming convention `test_check_component_name()`. + +## Troubleshooting + +### Python Package Installation Issues (Ubuntu 24.04) + +If `sudo pip3 install` fails with an "externally-managed-environment" error (common in Ubuntu 24.04), use a Python virtual environment instead: + +```bash +# Create a virtual environment (one-time setup) +python3 -m venv ~/rdhc-venv + +# Activate the virtual environment +source ~/rdhc-venv/bin/activate + +# Install required packages +pip3 install -r requirements.txt +``` + +**Note for Ubuntu 24.04 users:** Due to enhanced security policies, `sudo -E` does not preserve the virtual environment PATH. Replace all `sudo -E` commands with `sudo --preserve-env=PATH` in the usage examples above. + +For example: +```bash +# Instead of: sudo -E ./rdhc.py +# Use: +source ~/rdhc-venv/bin/activate +sudo --preserve-env=PATH ./rdhc.py + +# Run all tests +sudo --preserve-env=PATH ./rdhc.py --all + +# Run with verbose output +sudo --preserve-env=PATH ./rdhc.py -v +``` + +--- + +The tool is designed to be easily extended with additional component tests by adding new test methods following the naming convention `test_check_component_name()`. diff --git a/projects/rocm-core/rdhc/rdhc.py b/projects/rocm-core/rdhc/rdhc.py index 39163e15c3..c87e5add77 100755 --- a/projects/rocm-core/rdhc/rdhc.py +++ b/projects/rocm-core/rdhc/rdhc.py @@ -476,8 +476,7 @@ class ROCMHealthCheck: # AMD GPUs PCI class codes: 03xx (Display controllers ), 12xx (Processing accelerators) # use class codes also to identify AMD GPUs - stdout, _, ret_code = run_command( "lspci -d 1002: -nn | grep -Ei \ - 'Display controller|Processing accelerators|\[03[[:xdigit:]]{2}\]|\[12[[:xdigit:]]{2}\]' ",\ + stdout, _, ret_code = run_command( r"lspci -d 1002: -nn | grep -Ei 'Display controller|Processing accelerators|\[03[[:xdigit:]]{2}\]|\[12[[:xdigit:]]{2}\]' ",\ shell=True) gpu_hw = stdout.strip() if ret_code == 0 and gpu_hw: @@ -1254,7 +1253,7 @@ class ROCMHealthCheck: # 2. Check if network cards (NICs) are present in hardware list self.logger.info("----Checking for network interface cards...") nic_brand = None - nic_cards, stderr, ret_code = run_command("lspci -nn | grep -i 'ethernet\|network\|infiniband'", shell=True) + nic_cards, stderr, ret_code = run_command("lspci -nn | grep -Ei 'ethernet|network|infiniband'", shell=True) if ret_code != 0 or not nic_cards.strip(): errors += 1 cluster_readiness_issues.append("No network cards found in hardware") @@ -1348,6 +1347,137 @@ class ROCMHealthCheck: self.logger.info(f" {success_msg}") return TestStatus.PASS.value, success_msg + def test_check_atomic_operations(self): + """Test if atomic operations are enabled for GPU devices""" + self.logger.info("--Checking atomic operations support for GPU devices...") + + # Find AMD GPU devices using lspci + stdout, stderr, ret_code = run_command("lspci -d 1002: -nn | grep -Ei 'Display controller|Processing accelerators|VGA compatible controller'", shell=True) + + if ret_code != 0 or not stdout.strip(): + self.logger.error("!!! No AMD GPU devices found") + return TestStatus.FAIL.value, "No AMD GPU devices found to check atomic operations." + + gpu_devices = stdout.strip().split('\n') + self.logger.info(f"----Found {len(gpu_devices)} AMD GPU device(s)") + + def parse_atomic_details(stdout_detail, pci_address): + """Parse atomic operations details from lspci output""" + atomic_cap_found = False + atomic_enabled = False + + for line in stdout_detail.strip().split('\n'): + line = line.strip() + + if "AtomicOpsCap:" in line: + atomic_cap_found = True + self.logger.debug(f"------Device {pci_address}: {line}") + + if "AtomicOpsCtl:" in line: + # Check if ReqEn+ (Request Enable is set) + if "ReqEn+" in line: + atomic_enabled = True + self.logger.debug(f"------Device {pci_address}: {line}") + + return atomic_cap_found, atomic_enabled + + def check_device_atomic_ops(gpu_line): + """Check atomic operations for a single GPU device""" + # Extract PCI address using regex (e.g., "01:00.0") + pci_match = re.match(r'^([0-9a-f]{2}:[0-9a-f]{2}\.[0-9a-f])', gpu_line.strip()) + + if not pci_match: + self.logger.warning(f"!!! Could not extract PCI address from line: {gpu_line}") + return None, "check_failed", f"Invalid format: Could not extract PCI address" + + pci_address = pci_match.group(1) + + # Get atomic operations info using grep to filter relevant lines + stdout_detail, stderr_detail, ret_detail = run_command( + f"lspci -vvv -s {pci_address} | grep -i atomic", + shell=True + ) + + if ret_detail != 0 or not stdout_detail.strip(): + self.logger.warning(f"!!! Failed to get atomic operations info for device {pci_address}") + self.logger.warning(f"!!! Try running the test with 'sudo -E' ") + return pci_address, "check_failed", f"{pci_address}: Check failed" + + # Parse AtomicOpsCap and AtomicOpsCtl + atomic_cap_found, atomic_enabled = parse_atomic_details(stdout_detail, pci_address) + + # Determine device status + if atomic_cap_found and atomic_enabled: + status_msg = f"{pci_address}: Supported and Enabled" + self.logger.info(f"------{status_msg}") + return pci_address, "enabled", status_msg + elif atomic_cap_found and not atomic_enabled: + status_msg = f"{pci_address}: Supported but NOT Enabled (ReqEn-)" + self.logger.warning(f"!!! {status_msg}") + return pci_address, "disabled", status_msg + else: + status_msg = f"{pci_address}: Capability not found or unclear" + self.logger.warning(f"!!! {status_msg}") + return pci_address, "check_failed", status_msg + + def check_pcie_atomic_routing_capability(pci_address): + """Check PCIe generation and lane configuration for atomic routing""" + + stdout, stderr, ret_code = run_command( + f"lspci -vvv -s {pci_address} | grep -E 'LnkCap:|LnkSta:'", + shell=True + ) + + if ret_code == 0 and stdout.strip(): + self.logger.debug(f"------PCIe Link Capabilities for {pci_address}:") + for line in stdout.strip().split('\n'): + self.logger.info(f"--------{line.strip()}") + + # Check for PCIe Gen4/Gen5 which have better atomic support + if "LnkSta" in line and "Speed" in line: + if "16GT/s" in line: # PCIe Gen4 + self.logger.info(f"------Device {pci_address}: PCIe Gen4 (16GT/s) - Good atomic routing capability") + elif "32GT/s" in line: # PCIe Gen5 + self.logger.info(f"------Device {pci_address}: PCIe Gen5 (32GT/s) - Excellent atomic routing capability") + elif "8GT/s" in line: # PCIe Gen3 + self.logger.warning(f"!!! Device {pci_address}: PCIe Gen3 (8GT/s) - Limited atomic routing capability") + + + atomic_ops_status = [] + devices_with_atomics = 0 + devices_without_atomics = 0 + check_failed_devices = 0 + + # Check atomic operations for each GPU device + for gpu_line in gpu_devices: + pci_address, status_type, status_msg = check_device_atomic_ops(gpu_line) + atomic_ops_status.append(status_msg) + if pci_address is not None: + check_pcie_atomic_routing_capability(pci_address) + + if status_type == "enabled": + devices_with_atomics += 1 + elif status_type == "disabled": + devices_without_atomics += 1 + else: # check_failed + check_failed_devices += 1 + + # Log summary + self.logger.info(f"----Atomic operations summary:") + self.logger.info(f"------Devices with atomic ops enabled: {devices_with_atomics}") + self.logger.info(f"------Devices with atomic ops disabled: {devices_without_atomics}") + if check_failed_devices > 0: + self.logger.info(f"------Devices with check failed/unclear: {check_failed_devices}") + + # Determine overall status + if devices_without_atomics > 0: + return TestStatus.FAIL.value, f"Atomic operations not enabled for {devices_without_atomics} device(s). Details: {'; '.join(atomic_ops_status)}" + elif check_failed_devices > 0: + return TestStatus.FAIL.value, f"Atomic operations check completed with {check_failed_devices} warning(s). Details: {'; '.join(atomic_ops_status)}" + else: + return TestStatus.PASS.value, f"Atomic operations supported and enabled on all {devices_with_atomics} GPU device(s)." + + # Example component specific tests (these should be customized for each component) def test_check_hipcc(self): """Test hipcc package""" @@ -1566,54 +1696,70 @@ class ROCMHealthCheck: # TODO return TestStatus.PASS.value, f"{component} is installed but no specific test available." + def _print_test_start(self, test_name): + """Print a separator line and test start message + + Args: + test_name (str): Name of the test being run + """ + separator = "=" * 80 + print(f"\n{separator}") + self.logger.info(f"Running test: {test_name}...") + def run_default_tests(self): """Run the default set of tests""" results = {} # Test 1: GPU Presence - self.logger.info("Running test: GPU Presence...") + self._print_test_start("GPU Presence") status, reason = self.test_GPUPresence() results["gpu_presence"] = {"status": status, "reason": reason} # Test 2: AMDGPU Driver - self.logger.info("Running test: AMDGPU Driver...") + self._print_test_start("AMDGPU Driver") status, reason = self.test_amdgpu_driver() results["amdgpu_driver"] = {"status": status, "reason": reason} # Test 3: Kernel Parameters - self.logger.info("Running test: Kernel Parameters...") + self._print_test_start("Kernel Parameters") status, reason = self.test_check_kernel_parameters() results["kernel_parameters"] = {"status": status, "reason": reason} # Test 4: rocminfo - self.logger.info("Running test: rocminfo...") + self._print_test_start("rocminfo") status, reason = self.test_rocminfo() results["rocminfo"] = {"status": status, "reason": reason} # Test 5: rocm_agent_enumerator - self.logger.info("Running test: rocm_agent_enumerator...") + self._print_test_start("rocm_agent_enumerator") status, reason = self.test_rocm_agent_enumerator() results["rocm_agent_enumerator"] = {"status": status, "reason": reason} # Test 6: amd-smi - self.logger.info("Running test: amd-smi...") + self._print_test_start("amd-smi") status, reason = self.test_amd_smi() results["amd_smi"] = {"status": status, "reason": reason} # Test 7: Library Dependencies - self.logger.info("Running test: Library Dependencies...") + self._print_test_start("Library Dependencies") status, reason = self.test_check_lib_dependencies() results["lib_dependencies"] = {"status": status, "reason": reason} # Test 8: Environment Variables - self.logger.info("Running test: ENV variables...") + self._print_test_start("ENV variables") status, reason = self.test_check_env_variables() results["env_variables"] = {"status": status, "reason": reason} # Test 9: Multinode cluster readiness - self.logger.info("Running test: Multinode cluster readiness...") + self._print_test_start("Multinode cluster readiness") status, reason = self.test_check_multinode_cluster_readiness() results["Multinode_Readiness"] = {"status": status, "reason": reason} + + # Test 10: Atomic Operations + self._print_test_start("Is Atomic Operations Enabled") + status, reason = self.test_check_atomic_operations() + results["atomic_operations"] = {"status": status, "reason": reason} + return results def run_component_tests(self): @@ -1622,7 +1768,7 @@ class ROCMHealthCheck: for component in self.installed_components: if component not in self.exclude_list: - self.logger.info(f"Running component test: {component}...") + self._print_test_start(f"Component - {component}") status, reason = self.test_component(component) results[component] = {"status": status, "reason": reason} @@ -1638,7 +1784,7 @@ class ROCMHealthCheck: # Run tests for each application target for target in self.rocm_examples_targets.get("applications", []): - self.logger.info(f"Running application test: [ {target} ]...") + self._print_test_start(f"Application - {target}") status, reason = self._build_target_and_run(target, target) results[target] = {"status": status, "reason": reason} @@ -1817,6 +1963,17 @@ def main(): "\n"+ "# Specify a directory for temp files and logs (default: /tmp/rdhc/)\n" + "sudo -E ./rdhc.py -d /home/user/rdhc-dir/\n" + + "\n"+ + "NOTE for Ubuntu 24.04 (Python 3.12) users:\n" + + "Due to enhanced security policies, you must use a virtual environment:\n" + + " # Create and activate virtual environment (one-time setup)\n" + + " python3 -m venv ~/rdhc-venv\n" + + " source ~/rdhc-venv/bin/activate\n" + + " pip3 install -r requirements.txt\n" + + "\n" + + " # Run the tool (use --preserve-env=PATH instead of -E)\n" + + " sudo --preserve-env=PATH ./rdhc.py\n" + + " sudo --preserve-env=PATH ./rdhc.py --all\n" + " ", )