1use crate::aggregation::AggregatedMetrics;
11use std::sync::Arc;
12use tokio::sync::RwLock;
13use tracing::{debug, warn};
14
/// Reports whether `nvidia-smi` can be launched and exits successfully.
///
/// Runs `nvidia-smi --query-gpu=count --format=csv,noheader` synchronously
/// and waits for it to finish. Returns `false` both when the process cannot
/// be spawned and when it exits with a non-success status.
pub fn nvidia_smi_available() -> bool {
    let probe = std::process::Command::new("nvidia-smi")
        .args(["--query-gpu=count", "--format=csv,noheader"])
        .output();

    match probe {
        Ok(finished) => finished.status.success(),
        // Spawn failure (e.g. binary not found) counts as "not available".
        Err(_) => false,
    }
}
24
25pub async fn run_fallback(metrics: Arc<RwLock<AggregatedMetrics>>, interval_ms: u64) {
28 let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(interval_ms));
29
30 loop {
31 interval.tick().await;
32
33 if let Some(gpu_data) = poll_nvidia_smi().await {
35 let mut m = metrics.write().await;
36 let now_ms = std::time::SystemTime::now()
37 .duration_since(std::time::UNIX_EPOCH)
38 .unwrap_or_default()
39 .as_millis() as u64;
40 m.last_update_ms = now_ms;
41
42 for gpu in &gpu_data {
43 m.record_gpu_mem(
44 0, gpu.index,
46 gpu.memory_used_bytes,
47 gpu.memory_total_bytes,
48 );
49 }
50
51 debug!(gpus = gpu_data.len(), "fallback: nvidia-smi poll complete");
52 }
53
54 #[cfg(target_os = "linux")]
56 if let Some(iowait_pct) = poll_proc_stat() {
57 debug!(iowait_pct, "fallback: /proc/stat iowait");
58 }
59 }
60}
61
/// One GPU's readings as parsed from a single `nvidia-smi` CSV row.
#[derive(Debug)]
struct GpuMetrics {
    /// GPU index, taken from the `index` column of the query.
    index: u32,
    /// Memory in use, in bytes (the `memory.used` column converted from MiB).
    memory_used_bytes: u64,
    /// Total memory, in bytes (the `memory.total` column converted from MiB).
    memory_total_bytes: u64,
    /// Value of the `utilization.gpu` column.
    utilization_pct: u32,
    /// Value of the `temperature.gpu` column.
    temperature_c: u32,
}
70
71async fn poll_nvidia_smi() -> Option<Vec<GpuMetrics>> {
73 let output = tokio::process::Command::new("nvidia-smi")
74 .args([
75 "--query-gpu=index,memory.used,memory.total,utilization.gpu,temperature.gpu",
76 "--format=csv,noheader,nounits",
77 ])
78 .output()
79 .await
80 .ok()?;
81
82 if !output.status.success() {
83 warn!("nvidia-smi query failed");
84 return None;
85 }
86
87 let stdout = String::from_utf8_lossy(&output.stdout);
88 let mut gpus = Vec::new();
89
90 for line in stdout.lines() {
91 let fields: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
92 if fields.len() < 5 {
93 continue;
94 }
95
96 let index = fields[0].parse::<u32>().unwrap_or(0);
97 let mem_used_mib = fields[1].parse::<u64>().unwrap_or(0);
98 let mem_total_mib = fields[2].parse::<u64>().unwrap_or(0);
99 let util_pct = fields[3].parse::<u32>().unwrap_or(0);
100 let temp = fields[4].parse::<u32>().unwrap_or(0);
101
102 gpus.push(GpuMetrics {
103 index,
104 memory_used_bytes: mem_used_mib * 1024 * 1024,
105 memory_total_bytes: mem_total_mib * 1024 * 1024,
106 utilization_pct: util_pct,
107 temperature_c: temp,
108 });
109 }
110
111 if gpus.is_empty() {
112 None
113 } else {
114 Some(gpus)
115 }
116}
117
/// Reads `/proc/stat` and returns the aggregate CPU iowait share as a percentage.
///
/// Returns `None` when the file cannot be read or its first line is not a
/// well-formed aggregate `cpu` line (see [`parse_proc_stat_iowait`]).
///
/// Per proc(5) the counters are cumulative since boot, so the result is a
/// since-boot average rather than the iowait share of the last poll interval.
#[cfg(target_os = "linux")]
fn poll_proc_stat() -> Option<f64> {
    let content = std::fs::read_to_string("/proc/stat").ok()?;
    parse_proc_stat_iowait(&content)
}

/// Computes the iowait percentage from the text of `/proc/stat`.
///
/// Only the first line is examined and it must start with the label `cpu`.
/// The denominator is the sum of `user nice system idle iowait irq softirq`
/// plus `steal` when the line carries an eighth counter. Leaving `steal` out
/// overstates iowait on virtualised hosts. Later counters (`guest`,
/// `guest_nice`) are ignored; proc(5) documents guest time as already
/// included in `user`/`nice`.
///
/// Returns `None` if fewer than seven counters are present, any examined
/// counter is not an unsigned integer, the sum overflows `u64`, or the sum is
/// zero.
#[cfg(target_os = "linux")]
fn parse_proc_stat_iowait(content: &str) -> Option<f64> {
    /// Counters that must be present (`user` through `softirq`).
    const REQUIRED_COUNTERS: usize = 7;
    /// Counters that contribute to the total (`user` through `steal`).
    const SUMMED_COUNTERS: usize = 8;
    /// Position of `iowait` among the counters, after the `cpu` label.
    const IOWAIT_POS: usize = 4;

    let mut tokens = content.lines().next()?.split_whitespace();
    if tokens.next()? != "cpu" {
        return None;
    }

    let counters = tokens
        .take(SUMMED_COUNTERS)
        .map(|token| token.parse::<u64>().ok())
        .collect::<Option<Vec<u64>>>()?;
    if counters.len() < REQUIRED_COUNTERS {
        return None;
    }

    let total = counters
        .iter()
        .try_fold(0u64, |acc, &ticks| acc.checked_add(ticks))?;
    if total == 0 {
        return None;
    }

    let iowait = *counters.get(IOWAIT_POS)?;
    Some(iowait as f64 / total as f64 * 100.0)
}
145
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the CSV row layout (five comma-separated fields, memory in MiB)
    /// on a two-GPU sample.
    ///
    /// The rows are parsed inline here, not by calling `poll_nvidia_smi`,
    /// which spawns a process.
    #[test]
    fn parse_nvidia_smi_csv() {
        const MIB: u64 = 1024 * 1024;
        let sample = "0, 45678, 81920, 94, 72\n1, 43210, 81920, 87, 68\n";

        let parsed: Vec<GpuMetrics> = sample
            .lines()
            .filter_map(|row| {
                let cols: Vec<&str> = row.split(',').map(str::trim).collect();
                if cols.len() < 5 {
                    return None;
                }
                Some(GpuMetrics {
                    index: cols[0].parse().unwrap(),
                    memory_used_bytes: cols[1].parse::<u64>().unwrap() * MIB,
                    memory_total_bytes: cols[2].parse::<u64>().unwrap() * MIB,
                    utilization_pct: cols[3].parse().unwrap(),
                    temperature_c: cols[4].parse().unwrap(),
                })
            })
            .collect();

        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].utilization_pct, 94);
        assert_eq!(parsed[1].memory_used_bytes, 43210 * MIB);
    }

    /// Reads the live `/proc/stat` and expects an iowait figure to come back.
    #[cfg(target_os = "linux")]
    #[test]
    fn parse_proc_stat() {
        let iowait = poll_proc_stat();
        assert!(iowait.is_some());
    }
}