// zernel/commands/fleet.rs
// Copyright (C) 2026 Dyber, Inc. — Proprietary

//! zernel fleet — GPU fleet management at scale
//!
//! Designed for AI labs running 100-10,000+ GPUs. Provides:
//! - Fleet-wide GPU utilization dashboard
//! - Cost attribution per team/project
//! - Idle GPU detection and automatic reclamation
//! - Right-sizing recommendations
//! - Capacity planning
use anyhow::Result;
use clap::Subcommand;
use std::process::Command;
// CLI surface for `zernel fleet`. The `///` doc comments double as clap
// help text shown to users — their wording is part of the UX.
#[derive(Subcommand)]
pub enum FleetCommands {
    /// Fleet-wide GPU utilization overview
    Status,
    /// Show cost attribution by team/project
    Costs {
        /// Time period (today, week, month)
        // Free-form string; not validated against the three listed values.
        #[arg(long, default_value = "month")]
        period: String,
    },
    /// Detect idle GPUs across the fleet
    Idle {
        /// Utilization threshold (%) below which a GPU is "idle"
        #[arg(long, default_value = "5")]
        threshold: u32,
        /// Duration (minutes) a GPU must be idle before flagging
        // NOTE(review): currently only echoed by the handler — a single
        // snapshot cannot measure idle duration; needs history. TODO confirm.
        #[arg(long, default_value = "30")]
        duration: u32,
    },
    /// Reclaim idle GPUs (reassign or power down)
    Reclaim {
        /// Dry run (show what would be reclaimed)
        #[arg(long)]
        dry_run: bool,
    },
    /// Right-sizing recommendations
    Rightsize,
    /// Capacity planning — predict when you'll need more GPUs
    Plan {
        /// Growth rate (% per month)
        #[arg(long, default_value = "10")]
        growth: f64,
    },
    /// Fleet health report
    Health,
}
52
53pub async fn run(cmd: FleetCommands) -> Result<()> {
54    match cmd {
55        FleetCommands::Status => {
56            println!("Zernel Fleet Status");
57            println!("{}", "=".repeat(70));
58
59            // Load cluster nodes
60            let cluster_file = crate::experiments::tracker::zernel_dir()
61                .join("cluster")
62                .join("nodes.json");
63
64            if !cluster_file.exists() {
65                println!("No fleet configured. Add nodes: zernel cluster add <host> --gpus 8");
66                println!();
67                println!("For single-node fleet status:");
68                let output = Command::new("nvidia-smi")
69                    .args([
70                        "--query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw",
71                        "--format=csv,noheader,nounits",
72                    ])
73                    .output();
74
75                if let Ok(o) = output {
76                    let stdout = String::from_utf8_lossy(&o.stdout);
77                    let mut total_gpus = 0u32;
78                    let mut total_util = 0u32;
79                    let mut total_power = 0.0f64;
80
81                    println!(
82                        "{:<5} {:<20} {:>6} {:>12} {:>6} {:>8}",
83                        "GPU", "Name", "Util", "Memory", "Temp", "Power"
84                    );
85                    println!("{}", "-".repeat(70));
86
87                    for line in stdout.lines() {
88                        let f: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
89                        if f.len() >= 7 {
90                            let util: u32 = f[2].parse().unwrap_or(0);
91                            let power: f64 = f[6].parse().unwrap_or(0.0);
92                            total_gpus += 1;
93                            total_util += util;
94                            total_power += power;
95                            println!(
96                                "{:<5} {:<20} {:>4}% {:>5}/{:<5}MB {:>4}°C {:>6.0}W",
97                                f[0], f[1], f[2], f[3], f[4], f[5], power
98                            );
99                        }
100                    }
101
102                    if total_gpus > 0 {
103                        println!();
104                        println!("Fleet Summary:");
105                        println!("  Total GPUs:     {total_gpus}");
106                        println!("  Avg utilization: {}%", total_util / total_gpus);
107                        println!(
108                            "  Total power:     {total_power:.0}W ({:.1} kW)",
109                            total_power / 1000.0
110                        );
111                        println!(
112                            "  Est. daily cost: ${:.0} (at $0.10/kWh)",
113                            total_power / 1000.0 * 24.0 * 0.10
114                        );
115                    }
116                }
117                return Ok(());
118            }
119
120            // Multi-node fleet status would SSH to each node here
121            println!("Multi-node fleet status: use `zernel cluster status`");
122        }
123
124        FleetCommands::Costs { period } => {
125            println!("Fleet Cost Attribution — {period}");
126            println!("{}", "=".repeat(60));
127
128            // Calculate from experiments and jobs databases
129            let exp_db = crate::experiments::tracker::experiments_db_path();
130            if exp_db.exists() {
131                let conn = rusqlite::Connection::open(&exp_db)?;
132                let total_secs: f64 = conn
133                    .query_row(
134                        "SELECT COALESCE(SUM(duration_secs), 0) FROM experiments",
135                        [],
136                        |row| row.get(0),
137                    )
138                    .unwrap_or(0.0);
139
140                let total_hours = total_secs / 3600.0;
141                let exp_count: u32 = conn
142                    .query_row("SELECT COUNT(*) FROM experiments", [], |row| row.get(0))
143                    .unwrap_or(0);
144
145                println!("  Experiments:     {exp_count}");
146                println!("  Total GPU-hours: {total_hours:.1}h");
147                println!();
148
149                // Cost estimates at various price points
150                println!("  Estimated costs:");
151                println!(
152                    "    At $2.50/GPU-hr (A100 on-demand):  ${:.0}",
153                    total_hours * 2.50
154                );
155                println!(
156                    "    At $1.50/GPU-hr (A100 reserved):   ${:.0}",
157                    total_hours * 1.50
158                );
159                println!(
160                    "    At $4.00/GPU-hr (H100 on-demand):  ${:.0}",
161                    total_hours * 4.00
162                );
163                println!(
164                    "    At $0.10/kWh (on-prem electricity): ${:.0}",
165                    total_hours * 0.3 * 0.10
166                );
167            } else {
168                println!("  No experiment data. Run: zernel run <script>");
169            }
170        }
171
172        FleetCommands::Idle {
173            threshold,
174            duration,
175        } => {
176            println!("Idle GPU Detection (threshold: <{threshold}% for >{duration}min)");
177            println!("{}", "=".repeat(60));
178
179            let output = Command::new("nvidia-smi")
180                .args([
181                    "--query-gpu=index,name,utilization.gpu",
182                    "--format=csv,noheader,nounits",
183                ])
184                .output();
185
186            if let Ok(o) = output {
187                let stdout = String::from_utf8_lossy(&o.stdout);
188                let mut idle_count = 0;
189                for line in stdout.lines() {
190                    let f: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
191                    if f.len() >= 3 {
192                        let util: u32 = f[2].parse().unwrap_or(0);
193                        if util < threshold {
194                            println!("  GPU {}: {} — {}% (IDLE)", f[0], f[1], util);
195                            idle_count += 1;
196                        }
197                    }
198                }
199
200                if idle_count == 0 {
201                    println!("  No idle GPUs detected.");
202                } else {
203                    println!();
204                    println!("  {idle_count} idle GPU(s) detected.");
205                    println!("  Reclaim with: zernel fleet reclaim");
206                    println!("  Estimated savings: ${:.0}/day per idle GPU", 24.0 * 2.50);
207                }
208            }
209        }
210
211        FleetCommands::Reclaim { dry_run } => {
212            if dry_run {
213                println!("Dry run — showing what would be reclaimed:");
214            } else {
215                println!("Reclaiming idle GPUs...");
216            }
217
218            let output = Command::new("nvidia-smi")
219                .args([
220                    "--query-gpu=index,utilization.gpu",
221                    "--format=csv,noheader,nounits",
222                ])
223                .output();
224
225            if let Ok(o) = output {
226                let stdout = String::from_utf8_lossy(&o.stdout);
227                for line in stdout.lines() {
228                    let f: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
229                    if f.len() >= 2 {
230                        let util: u32 = f[1].parse().unwrap_or(100);
231                        if util < 5 {
232                            println!("  GPU {}: {}% utilized", f[0], util);
233                            if !dry_run {
234                                // Lower power state
235                                let _ = Command::new("nvidia-smi")
236                                    .args(["-i", f[0], "-pl", "50"])
237                                    .output();
238                                println!("    → Power reduced to 50W (idle mode)");
239                            } else {
240                                println!("    → Would reduce power to idle mode");
241                            }
242                        }
243                    }
244                }
245            }
246        }
247
248        FleetCommands::Rightsize => {
249            println!("GPU Right-Sizing Recommendations");
250            println!("{}", "=".repeat(60));
251
252            let exp_db = crate::experiments::tracker::experiments_db_path();
253            if exp_db.exists() {
254                println!("Based on your training history:");
255                println!();
256                println!("  Recommendation: Analyze GPU memory watermarks and utilization");
257                println!("  patterns from your experiments to determine optimal GPU type.");
258                println!();
259                println!("  If avg GPU memory < 40GB: Consider A100 40GB (cheaper)");
260                println!("  If avg GPU memory < 24GB: Consider A10G or L4 (much cheaper)");
261                println!("  If avg GPU util < 50%: Reduce GPU count or use smaller GPUs");
262                println!("  If NCCL is >20% of step time: Need faster interconnect (NVLink/IB)");
263                println!();
264                println!("  Run: zernel bench all — to generate utilization profile");
265                println!("  Run: zernel debug why-slow — to identify bottlenecks");
266            } else {
267                println!("  Need training data. Run experiments first: zernel run <script>");
268            }
269        }
270
271        FleetCommands::Plan { growth } => {
272            println!("Capacity Planning (growth: {growth}%/month)");
273            println!("{}", "=".repeat(60));
274
275            let output = Command::new("nvidia-smi")
276                .args(["--query-gpu=count", "--format=csv,noheader"])
277                .output();
278
279            let current_gpus: u32 = output
280                .ok()
281                .and_then(|o| {
282                    String::from_utf8_lossy(&o.stdout)
283                        .trim()
284                        .lines()
285                        .next()
286                        .and_then(|s| s.parse().ok())
287                })
288                .unwrap_or(0);
289
290            println!("  Current GPUs: {current_gpus}");
291            println!();
292            println!("  Projected need (at {growth}% monthly growth):");
293
294            let mut gpus = current_gpus as f64;
295            for month in 1..=12 {
296                gpus *= 1.0 + growth / 100.0;
297                let cost_ondemand = gpus * 24.0 * 30.0 * 2.50;
298                let cost_reserved = gpus * 24.0 * 30.0 * 1.50;
299                println!(
300                    "    Month {month:>2}: {:>4.0} GPUs — ${cost_reserved:.0}-${cost_ondemand:.0}/mo",
301                    gpus
302                );
303            }
304        }
305
306        FleetCommands::Health => {
307            println!("Fleet Health Report");
308            println!("{}", "=".repeat(60));
309
310            // Check each subsystem
311            let checks = [
312                ("nvidia-smi", "GPU drivers"),
313                ("zernel", "Zernel CLI"),
314                ("python3", "Python runtime"),
315            ];
316
317            for (cmd, name) in &checks {
318                let ok = Command::new(cmd)
319                    .arg("--version")
320                    .output()
321                    .map(|o| o.status.success())
322                    .unwrap_or(false);
323                println!("  {name:<25} {}", if ok { "OK" } else { "MISSING" });
324            }
325
326            // Check zerneld
327            let zerneld_ok = std::net::TcpStream::connect_timeout(
328                &"127.0.0.1:9091".parse().expect("valid"),
329                std::time::Duration::from_millis(500),
330            )
331            .is_ok();
332            println!(
333                "  {:<25} {}",
334                "zerneld (observability)",
335                if zerneld_ok { "RUNNING" } else { "STOPPED" }
336            );
337
338            // Check dashboard
339            let dash_ok = std::net::TcpStream::connect_timeout(
340                &"127.0.0.1:3000".parse().expect("valid"),
341                std::time::Duration::from_millis(500),
342            )
343            .is_ok();
344            println!(
345                "  {:<25} {}",
346                "zernel-dashboard (web)",
347                if dash_ok { "RUNNING" } else { "STOPPED" }
348            );
349
350            // Check ollama
351            let ollama_ok = std::net::TcpStream::connect_timeout(
352                &"127.0.0.1:11434".parse().expect("valid"),
353                std::time::Duration::from_millis(500),
354            )
355            .is_ok();
356            println!(
357                "  {:<25} {}",
358                "ollama (local LLM)",
359                if ollama_ok { "RUNNING" } else { "STOPPED" }
360            );
361        }
362    }
363    Ok(())
364}