Skip to content

Commit

Permalink
Merge pull request #670 from dougsland/gpudetector
Browse files Browse the repository at this point in the history
Add ramalama gpu_detector
  • Loading branch information
rhatdan authored Feb 1, 2025
2 parents 38974ff + 8d27050 commit 4b34290
Show file tree
Hide file tree
Showing 2 changed files with 286 additions and 0 deletions.
56 changes: 56 additions & 0 deletions ramalama/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import json
import os
import subprocess
import platform
import time
import ramalama.oci

Expand All @@ -21,6 +22,7 @@
from ramalama.shortnames import Shortnames
from ramalama.toml_parser import TOMLParser
from ramalama.version import version, print_version
from ramalama.gpu_detector import GPUDetector

shortnames = Shortnames()

Expand Down Expand Up @@ -239,6 +241,58 @@ def configure_subcommands(parser):
version_parser(subparsers)


def show_gpus_available_cli(args):
    """Detect and return available GPUs, with macOS support.

    Args:
        args: parsed CLI namespace; accepted for handler-signature
            consistency but not used by this function.

    Returns:
        dict with two keys:
          "Detected GPUs" - list of per-GPU info dicts (a single placeholder
              entry when nothing was detected)
          "INFO" - list of per-vendor error dicts, or the string "No errors"
    """
    gpu_detector = GPUDetector()
    gpu_info = []
    errors = []

    # One (vendor, probe, not-found message) triple per detection path.
    # Factored into a table to remove the previous three copy-pasted
    # try/except stanzas.
    if platform.system() == "Darwin":
        # macOS: system_profiler covers every GPU vendor in one call.
        probes = [("Apple", gpu_detector.get_macos_gpu, "No GPU detected on macOS.")]
    else:
        probes = [
            ("NVIDIA", gpu_detector.get_nvidia_gpu, "No NVIDIA GPU detected or drivers missing."),
            ("AMD", gpu_detector.get_amd_gpu, "No AMD GPU detected or drivers missing."),
            ("Intel", gpu_detector.get_intel_gpu, "No Intel GPU detected or drivers missing."),
        ]

    for vendor, probe, not_found_msg in probes:
        try:
            found = probe()
            if found:
                gpu_info.extend(found)
            else:
                errors.append({"Vendor": vendor, "INFO": not_found_msg})
        except Exception as e:
            # Detection failures are reported, not fatal: one vendor's
            # missing driver must not hide the other vendors' results.
            errors.append({"Vendor": vendor, "INFO": str(e)})

    return {
        "Detected GPUs": gpu_info if gpu_info else [{"GPU": "None", "VRAM": "N/A", "INFO": "No GPUs detected"}],
        "INFO": errors if errors else "No errors"
    }


def parse_arguments(parser):
    """Parse the process command line with *parser* and return the namespace."""
    parsed = parser.parse_args()
    return parsed
Expand Down Expand Up @@ -508,6 +562,8 @@ def info_cli(args):
if args.engine and len(args.engine) > 0:
info["Engine"]["Info"] = engine_info(args)

gpu_info = show_gpus_available_cli(args)
info["GPUs"] = gpu_info
print(json.dumps(info, sort_keys=True, indent=4))


Expand Down
230 changes: 230 additions & 0 deletions ramalama/gpu_detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
"""
MIT License
(C) 2024-2025 ramalama developers
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
"""

import subprocess
import glob
import platform
import logging

# NOTE(review): basicConfig at import time configures the *root* logger for
# the whole process, which can surprise applications embedding this module —
# consider a module-level `logging.getLogger(__name__)` instead; confirm with
# how ramalama's CLI sets up logging.
logging.basicConfig(
    level=logging.WARNING,
    format="%(asctime)s - %(levelname)s - %(message)s"
)


class GPUDetector:
    """Detect GPUs from NVIDIA, AMD, Intel and Apple vendors.

    Each ``get_*_gpu`` method returns a list of GPU-info dicts (keys such as
    "GPU", "VRAM", "Env"). The detector also tracks the best GPU seen so far
    (highest VRAM above a 1 GiB minimum) in ``best_gpu`` / ``best_vram`` /
    ``best_env``.
    """

    def __init__(self):
        self.best_gpu = None   # identifier of the best GPU found so far
        self.best_vram = 0     # its VRAM in MiB
        self.best_env = None   # env var used to select it (e.g. CUDA_VISIBLE_DEVICES)

    def _update_best_gpu(self, memory_mib, gpu_name, env_var):
        """Record *gpu_name* as best if it has the most VRAM seen so far.

        GPUs with 1 GiB (1024 MiB) or less are ignored as too small.
        """
        if memory_mib > 1024 and memory_mib > self.best_vram:
            self.best_vram = memory_mib
            self.best_gpu = gpu_name
            self.best_env = env_var

    def get_nvidia_gpu(self):
        """Detect NVIDIA GPUs using nvidia-smi (Linux only).

        Returns:
            list[dict]: one {"GPU", "VRAM", "Env"} dict per detected GPU;
            an empty list on non-Linux platforms.

        Raises:
            RuntimeError: if nvidia-smi is missing, fails, or its output
            cannot be parsed.
        """
        if platform.system() != "Linux":
            # BUG FIX: was a bare `return` (None); callers iterate/extend
            # the result, so an empty list is required.
            return []

        try:
            result = subprocess.run(
                ["nvidia-smi", "--query-gpu=index,memory.total", "--format=csv,noheader,nounits"],
                capture_output=True, text=True, check=True
            )
        except FileNotFoundError:
            raise RuntimeError("`nvidia-smi` not found. No NVIDIA GPU detected or drivers missing.")
        except subprocess.CalledProcessError as e:
            error_msg = e.stderr.strip() if e.stderr else "Unknown error (check if NVIDIA drivers are loaded)."
            raise RuntimeError(f"Unable to detect NVIDIA GPU(s). Error: {error_msg}")

        gpus = []
        for line in result.stdout.strip().split('\n'):
            if not line.strip():
                continue  # tolerate empty nvidia-smi output
            try:
                index, memory_mib = line.split(',')
                memory_mib = int(memory_mib.strip())
            except ValueError:
                raise RuntimeError(f"Error parsing Nvidia GPU info: {line}")
            self._update_best_gpu(memory_mib, index.strip(), "CUDA_VISIBLE_DEVICES")
            # BUG FIX: previously no result was built/returned at all, so
            # callers always concluded "no NVIDIA GPU".
            gpus.append({"GPU": index.strip(), "VRAM": f"{memory_mib} MiB", "Env": "CUDA_VISIBLE_DEVICES"})
        return gpus

    def get_amd_gpu(self):
        """Detect AMD GPUs using sysfs on Linux or system_profiler on macOS.

        Returns:
            list[dict]: GPU-info dicts; empty when nothing usable was found.
        """
        if platform.system() == "Linux":
            info = self._read_gpu_memory(
                '/sys/bus/pci/devices/*/mem_info_vram_total', "AMD GPU", "HIP_VISIBLE_DEVICES"
            )
            # BUG FIX: the result was previously discarded (the method
            # returned None). Only report a GPU when VRAM was actually read.
            if info.get("VRAM") not in (None, "Unknown") and "Error" not in info:
                return [info]
            return []
        if platform.system() == "Darwin":
            return self.get_macos_gpu()  # macOS detection covers AMD GPUs
        return []

    def _read_gpu_memory(self, path_pattern, gpu_name, env_var):
        """Read GPU VRAM from the first sysfs file matching *path_pattern*.

        Returns:
            dict: {"GPU", "VRAM", "Env"}; "VRAM" is "Unknown" when no file
            matched, plus an "Error" key when a read failed.
        """
        try:
            for mem_file in glob.glob(path_pattern):
                with open(mem_file, "r") as f:
                    vram_total = int(f.read().strip()) // (1024 * 1024)  # bytes -> MiB
                # First matching device wins (original behavior preserved).
                return {"GPU": gpu_name, "VRAM": f"{vram_total} MiB", "Env": env_var}
        except Exception as e:
            return {"GPU": gpu_name, "VRAM": "Unknown", "Env": env_var, "Error": str(e)}
        return {"GPU": gpu_name, "VRAM": "Unknown", "Env": env_var}

    def get_intel_gpu(self):
        """Detect Intel GPUs using lspci plus sysfs VRAM info.

        Returns:
            list[dict]: one entry per lspci-detected Intel VGA controller
            (each merged with the sysfs VRAM info), or a single VRAM-info
            entry when lspci found nothing.
        """
        gpus = []

        # Step 1: lspci to find Intel VGA controllers. Uses a list argv
        # (no shell=True) and filters in Python instead of piping to grep.
        try:
            output = subprocess.check_output(["lspci"], text=True)
            for line in output.splitlines():
                if "vga compatible controller" in line.lower() and "Intel Corporation" in line:
                    gpus.append({"GPU": "Intel", "Details": line.strip()})
        except (subprocess.CalledProcessError, FileNotFoundError):
            pass  # lspci missing or failed -> fall through to sysfs only

        # Step 2: best-effort VRAM read from /sys/class/drm/.
        vram_info = self._read_gpu_memory(
            '/sys/class/drm/card*/device/mem_info_vram_total', "Intel GPU", "ONEAPI_DEVICE_SELECTOR"
        )

        if gpus:
            for gpu in gpus:
                gpu.update(vram_info)
        else:
            gpus.append(vram_info)  # no lspci match: return VRAM data anyway

        return gpus

    def get_macos_gpu(self):
        """Detect GPUs on macOS using `system_profiler SPDisplaysDataType`.

        Returns:
            list[dict]: one dict per GPU with keys among "GPU", "Cores",
            "Vendor", "Metal"; on failure a single {"GPU": "Unknown",
            "Error": ...} entry.
        """
        try:
            output = subprocess.check_output(
                ["system_profiler", "SPDisplaysDataType"], text=True
            )
            gpus = []
            gpu_info = {}
            inside_gpu_section = False  # tracks when we are inside a GPU block

            for line in output.splitlines():
                line = line.strip()

                # A trailing colon (other than the "Displays:" header) starts
                # a new GPU section.
                if line.endswith(":") and "Displays:" not in line:
                    if gpu_info:  # store the previous GPU before starting a new one
                        gpus.append(gpu_info)
                        gpu_info = {}
                    inside_gpu_section = True
                    gpu_info["GPU"] = line[:-1]  # drop the trailing colon

                elif inside_gpu_section:
                    if "Chipset Model:" in line:
                        gpu_info["GPU"] = line.split(":")[1].strip()
                    elif "Total Number of Cores:" in line:
                        gpu_info["Cores"] = line.split(":")[1].strip()
                    elif "Vendor:" in line:
                        gpu_info["Vendor"] = line.split(":")[1].strip()
                    elif "Metal Support:" in line:
                        gpu_info["Metal"] = line.split(":")[1].strip()

            # Flush the last GPU section.
            if gpu_info:
                gpus.append(gpu_info)

            if not gpus:
                logging.warning("No GPUs detected on macOS.")
                return [{"GPU": "Unknown", "Error": "No GPU detected on macOS"}]

            return gpus

        except subprocess.CalledProcessError as e:
            logging.error(f"Failed to detect GPU on macOS: {e}")
            return [{"GPU": "Unknown", "Error": "Failed to detect GPU on macOS"}]
        except Exception as e:
            logging.error(f"Unexpected error while detecting macOS GPU: {e}")
            return [{"GPU": "Unknown", "Error": str(e)}]

    @staticmethod
    def _vram_mib(gpu):
        """Best-effort parse of a GPU dict's "VRAM" field ("<n> MiB") to MiB.

        Returns 0 when the field is missing or unparsable. BUG FIX: the old
        inline `int(... .split()[0])` raised an uncaught ValueError on the
        legitimate "Unknown" value produced by _read_gpu_memory.
        """
        try:
            return int(str(gpu.get("VRAM", "0 MiB")).split()[0])
        except (ValueError, IndexError):
            return 0

    def detect_best_gpu(self, gpu_template):
        """Append the available GPU with the highest VRAM to *gpu_template*.

        Tries NVIDIA, AMD and Intel on Linux, or system_profiler on macOS.
        A failure in one vendor's detection is logged and does not stop the
        others.

        Returns:
            bool: True when a GPU entry was appended, False otherwise.

        Raises:
            RuntimeError: on platforms other than Linux and macOS.
        """
        system = platform.system()
        best_gpu = None
        best_vram = 0
        best_env = None  # CUDA, ROCm, ONEAPI_DEVICE_SELECTOR, Metal, ...

        if system == "Linux":
            probes = [
                (self.get_nvidia_gpu, "CUDA", "NVIDIA"),
                (self.get_amd_gpu, "ROCm", "AMD"),
                (self.get_intel_gpu, "ONEAPI_DEVICE_SELECTOR", "Intel"),
            ]
        elif system == "Darwin":
            # Apple uses Metal for GPU acceleration.
            probes = [(self.get_macos_gpu, "Metal", "macOS GPU")]
        else:
            raise RuntimeError(f"GPU detection is not supported on {system}.")

        for probe, env, label in probes:
            try:
                for gpu in probe():
                    vram = self._vram_mib(gpu)
                    if vram > best_vram:
                        best_gpu = gpu
                        best_vram = vram
                        best_env = env
            except RuntimeError as e:
                logging.warning(f"Warning: {label} detection failed: {e}")

        if best_gpu is not None:
            gpu_template.append({
                "index": best_gpu["GPU"],
                "vram": f"{best_vram} MiB",
                "env": best_env
            })
            return True  # GPU detected and added successfully

        logging.warning("No compatible GPUs found.")
        return False  # No GPU found

0 comments on commit 4b34290

Please sign in to comment.