zernel_ebpf/
fallback.rs

1// Copyright (C) 2026 Dyber, Inc. — GPL-2.0
2
3//! Fallback telemetry provider using nvidia-smi and /proc.
4//!
5//! When BPF probes are unavailable (non-Linux, no root, kernel < 6.12),
6//! this module polls nvidia-smi and /proc to populate the same
7//! AggregatedMetrics that BPF ring buffers would. This provides REAL
8//! GPU telemetry without any kernel instrumentation.
9
10use crate::aggregation::AggregatedMetrics;
11use std::sync::Arc;
12use tokio::sync::RwLock;
13use tracing::{debug, warn};
14
/// Report whether `nvidia-smi` can be executed successfully on this host.
///
/// Spawns `nvidia-smi --query-gpu=count --format=csv,noheader` synchronously
/// and returns `true` only when the process starts and exits with a success
/// status. A failure to spawn (for example, the binary is not on `PATH`) is
/// reported as `false` rather than as an error.
pub fn nvidia_smi_available() -> bool {
    let probe = std::process::Command::new("nvidia-smi")
        .args(["--query-gpu=count", "--format=csv,noheader"])
        .output();

    match probe {
        Ok(finished) => finished.status.success(),
        // Spawn errors simply mean "not available".
        Err(_) => false,
    }
}
24
25/// Run the fallback telemetry poller using nvidia-smi.
26/// Produces real GPU memory and utilization data without BPF.
27pub async fn run_fallback(metrics: Arc<RwLock<AggregatedMetrics>>, interval_ms: u64) {
28    let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(interval_ms));
29
30    loop {
31        interval.tick().await;
32
33        // Poll nvidia-smi for GPU metrics
34        if let Some(gpu_data) = poll_nvidia_smi().await {
35            let mut m = metrics.write().await;
36            let now_ms = std::time::SystemTime::now()
37                .duration_since(std::time::UNIX_EPOCH)
38                .unwrap_or_default()
39                .as_millis() as u64;
40            m.last_update_ms = now_ms;
41
42            for gpu in &gpu_data {
43                m.record_gpu_mem(
44                    0, // pid 0 = system-level (not per-process in fallback mode)
45                    gpu.index,
46                    gpu.memory_used_bytes,
47                    gpu.memory_total_bytes,
48                );
49            }
50
51            debug!(gpus = gpu_data.len(), "fallback: nvidia-smi poll complete");
52        }
53
54        // Poll /proc/stat for CPU iowait (Linux only)
55        #[cfg(target_os = "linux")]
56        if let Some(iowait_pct) = poll_proc_stat() {
57            debug!(iowait_pct, "fallback: /proc/stat iowait");
58        }
59    }
60}
61
/// One GPU's readings as parsed from a single `nvidia-smi` CSV row.
#[derive(Debug)]
struct GpuMetrics {
    /// GPU index as reported in the first CSV column.
    index: u32,
    /// Memory currently in use, in bytes.
    memory_used_bytes: u64,
    /// Total memory on the device, in bytes.
    memory_total_bytes: u64,
    /// GPU utilization, in percent.
    utilization_pct: u32,
    /// GPU temperature, in degrees Celsius.
    temperature_c: u32,
}
70
71/// Poll nvidia-smi CSV output for GPU metrics.
72async fn poll_nvidia_smi() -> Option<Vec<GpuMetrics>> {
73    let output = tokio::process::Command::new("nvidia-smi")
74        .args([
75            "--query-gpu=index,memory.used,memory.total,utilization.gpu,temperature.gpu",
76            "--format=csv,noheader,nounits",
77        ])
78        .output()
79        .await
80        .ok()?;
81
82    if !output.status.success() {
83        warn!("nvidia-smi query failed");
84        return None;
85    }
86
87    let stdout = String::from_utf8_lossy(&output.stdout);
88    let mut gpus = Vec::new();
89
90    for line in stdout.lines() {
91        let fields: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
92        if fields.len() < 5 {
93            continue;
94        }
95
96        let index = fields[0].parse::<u32>().unwrap_or(0);
97        let mem_used_mib = fields[1].parse::<u64>().unwrap_or(0);
98        let mem_total_mib = fields[2].parse::<u64>().unwrap_or(0);
99        let util_pct = fields[3].parse::<u32>().unwrap_or(0);
100        let temp = fields[4].parse::<u32>().unwrap_or(0);
101
102        gpus.push(GpuMetrics {
103            index,
104            memory_used_bytes: mem_used_mib * 1024 * 1024,
105            memory_total_bytes: mem_total_mib * 1024 * 1024,
106            utilization_pct: util_pct,
107            temperature_c: temp,
108        });
109    }
110
111    if gpus.is_empty() {
112        None
113    } else {
114        Some(gpus)
115    }
116}
117
/// Read `/proc/stat` and return the CPU iowait share as a percentage.
///
/// The value is computed from the aggregate `cpu` line, whose counters are
/// cumulative, so the result is the share of iowait time over the whole
/// counted period rather than an instantaneous rate.
///
/// Returns `None` when the file cannot be read, the first line is not a
/// well-formed aggregate `cpu` line, or every counter is zero.
#[cfg(target_os = "linux")]
fn poll_proc_stat() -> Option<f64> {
    let content = std::fs::read_to_string("/proc/stat").ok()?;
    parse_cpu_iowait_pct(&content)
}

/// Compute the iowait percentage from the text of `/proc/stat`.
///
/// Expects the first line to be
/// `cpu user nice system idle iowait irq softirq [steal ...]`.
/// The seven counters up to `softirq` are required; `steal` is added to the
/// total when present and numeric. Columns after `steal` are ignored.
/// Returns `None` on a malformed line, on arithmetic overflow, or when the
/// total is zero.
#[cfg(target_os = "linux")]
fn parse_cpu_iowait_pct(stat: &str) -> Option<f64> {
    // Position of `iowait` among the seven required counters.
    const IOWAIT: usize = 4;

    let mut fields = stat.lines().next()?.split_whitespace();
    if fields.next()? != "cpu" {
        return None;
    }

    // user, nice, system, idle, iowait, irq, softirq
    let mut counters = [0u64; 7];
    for slot in &mut counters {
        *slot = fields.next()?.parse().ok()?;
    }
    // Time stolen by a hypervisor is still elapsed CPU time; leaving it out
    // overstates the iowait share on virtualized hosts.
    let steal: u64 = fields.next().and_then(|s| s.parse().ok()).unwrap_or(0);

    let total = counters
        .iter()
        .try_fold(steal, |acc, &ticks| acc.checked_add(ticks))?;
    if total == 0 {
        return None;
    }

    Some(counters[IOWAIT] as f64 / total as f64 * 100.0)
}
145
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses two sample nvidia-smi CSV rows and checks the resulting values.
    #[test]
    fn parse_nvidia_smi_csv() {
        const MIB: u64 = 1024 * 1024;
        // Sample output in `csv,noheader,nounits` form: two GPUs.
        let csv = "0, 45678, 81920, 94, 72\n1, 43210, 81920, 87, 68\n";

        let gpus: Vec<GpuMetrics> = csv
            .lines()
            .map(|row| row.split(',').map(str::trim).collect::<Vec<&str>>())
            .filter(|cols| cols.len() >= 5)
            .map(|cols| GpuMetrics {
                index: cols[0].parse().unwrap(),
                memory_used_bytes: cols[1].parse::<u64>().unwrap() * MIB,
                memory_total_bytes: cols[2].parse::<u64>().unwrap() * MIB,
                utilization_pct: cols[3].parse().unwrap(),
                temperature_c: cols[4].parse().unwrap(),
            })
            .collect();

        assert_eq!(gpus.len(), 2);
        assert_eq!(gpus[0].utilization_pct, 94);
        assert_eq!(gpus[1].memory_used_bytes, 43210 * MIB);
    }

    /// On Linux (including CI) `/proc/stat` is expected to exist and parse.
    #[cfg(target_os = "linux")]
    #[test]
    fn parse_proc_stat() {
        let iowait = poll_proc_stat();
        assert!(iowait.is_some());
    }
}