1use anyhow::Result;
13use clap::Subcommand;
14use std::process::Command;
15
16#[derive(Subcommand)]
17pub enum FleetCommands {
18 Status,
20 Costs {
22 #[arg(long, default_value = "month")]
24 period: String,
25 },
26 Idle {
28 #[arg(long, default_value = "5")]
30 threshold: u32,
31 #[arg(long, default_value = "30")]
33 duration: u32,
34 },
35 Reclaim {
37 #[arg(long)]
39 dry_run: bool,
40 },
41 Rightsize,
43 Plan {
45 #[arg(long, default_value = "10")]
47 growth: f64,
48 },
49 Health,
51}
52
53pub async fn run(cmd: FleetCommands) -> Result<()> {
54 match cmd {
55 FleetCommands::Status => {
56 println!("Zernel Fleet Status");
57 println!("{}", "=".repeat(70));
58
59 let cluster_file = crate::experiments::tracker::zernel_dir()
61 .join("cluster")
62 .join("nodes.json");
63
64 if !cluster_file.exists() {
65 println!("No fleet configured. Add nodes: zernel cluster add <host> --gpus 8");
66 println!();
67 println!("For single-node fleet status:");
68 let output = Command::new("nvidia-smi")
69 .args([
70 "--query-gpu=index,name,utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw",
71 "--format=csv,noheader,nounits",
72 ])
73 .output();
74
75 if let Ok(o) = output {
76 let stdout = String::from_utf8_lossy(&o.stdout);
77 let mut total_gpus = 0u32;
78 let mut total_util = 0u32;
79 let mut total_power = 0.0f64;
80
81 println!(
82 "{:<5} {:<20} {:>6} {:>12} {:>6} {:>8}",
83 "GPU", "Name", "Util", "Memory", "Temp", "Power"
84 );
85 println!("{}", "-".repeat(70));
86
87 for line in stdout.lines() {
88 let f: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
89 if f.len() >= 7 {
90 let util: u32 = f[2].parse().unwrap_or(0);
91 let power: f64 = f[6].parse().unwrap_or(0.0);
92 total_gpus += 1;
93 total_util += util;
94 total_power += power;
95 println!(
96 "{:<5} {:<20} {:>4}% {:>5}/{:<5}MB {:>4}°C {:>6.0}W",
97 f[0], f[1], f[2], f[3], f[4], f[5], power
98 );
99 }
100 }
101
102 if total_gpus > 0 {
103 println!();
104 println!("Fleet Summary:");
105 println!(" Total GPUs: {total_gpus}");
106 println!(" Avg utilization: {}%", total_util / total_gpus);
107 println!(
108 " Total power: {total_power:.0}W ({:.1} kW)",
109 total_power / 1000.0
110 );
111 println!(
112 " Est. daily cost: ${:.0} (at $0.10/kWh)",
113 total_power / 1000.0 * 24.0 * 0.10
114 );
115 }
116 }
117 return Ok(());
118 }
119
120 println!("Multi-node fleet status: use `zernel cluster status`");
122 }
123
124 FleetCommands::Costs { period } => {
125 println!("Fleet Cost Attribution — {period}");
126 println!("{}", "=".repeat(60));
127
128 let exp_db = crate::experiments::tracker::experiments_db_path();
130 if exp_db.exists() {
131 let conn = rusqlite::Connection::open(&exp_db)?;
132 let total_secs: f64 = conn
133 .query_row(
134 "SELECT COALESCE(SUM(duration_secs), 0) FROM experiments",
135 [],
136 |row| row.get(0),
137 )
138 .unwrap_or(0.0);
139
140 let total_hours = total_secs / 3600.0;
141 let exp_count: u32 = conn
142 .query_row("SELECT COUNT(*) FROM experiments", [], |row| row.get(0))
143 .unwrap_or(0);
144
145 println!(" Experiments: {exp_count}");
146 println!(" Total GPU-hours: {total_hours:.1}h");
147 println!();
148
149 println!(" Estimated costs:");
151 println!(
152 " At $2.50/GPU-hr (A100 on-demand): ${:.0}",
153 total_hours * 2.50
154 );
155 println!(
156 " At $1.50/GPU-hr (A100 reserved): ${:.0}",
157 total_hours * 1.50
158 );
159 println!(
160 " At $4.00/GPU-hr (H100 on-demand): ${:.0}",
161 total_hours * 4.00
162 );
163 println!(
164 " At $0.10/kWh (on-prem electricity): ${:.0}",
165 total_hours * 0.3 * 0.10
166 );
167 } else {
168 println!(" No experiment data. Run: zernel run <script>");
169 }
170 }
171
172 FleetCommands::Idle {
173 threshold,
174 duration,
175 } => {
176 println!("Idle GPU Detection (threshold: <{threshold}% for >{duration}min)");
177 println!("{}", "=".repeat(60));
178
179 let output = Command::new("nvidia-smi")
180 .args([
181 "--query-gpu=index,name,utilization.gpu",
182 "--format=csv,noheader,nounits",
183 ])
184 .output();
185
186 if let Ok(o) = output {
187 let stdout = String::from_utf8_lossy(&o.stdout);
188 let mut idle_count = 0;
189 for line in stdout.lines() {
190 let f: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
191 if f.len() >= 3 {
192 let util: u32 = f[2].parse().unwrap_or(0);
193 if util < threshold {
194 println!(" GPU {}: {} — {}% (IDLE)", f[0], f[1], util);
195 idle_count += 1;
196 }
197 }
198 }
199
200 if idle_count == 0 {
201 println!(" No idle GPUs detected.");
202 } else {
203 println!();
204 println!(" {idle_count} idle GPU(s) detected.");
205 println!(" Reclaim with: zernel fleet reclaim");
206 println!(" Estimated savings: ${:.0}/day per idle GPU", 24.0 * 2.50);
207 }
208 }
209 }
210
211 FleetCommands::Reclaim { dry_run } => {
212 if dry_run {
213 println!("Dry run — showing what would be reclaimed:");
214 } else {
215 println!("Reclaiming idle GPUs...");
216 }
217
218 let output = Command::new("nvidia-smi")
219 .args([
220 "--query-gpu=index,utilization.gpu",
221 "--format=csv,noheader,nounits",
222 ])
223 .output();
224
225 if let Ok(o) = output {
226 let stdout = String::from_utf8_lossy(&o.stdout);
227 for line in stdout.lines() {
228 let f: Vec<&str> = line.split(',').map(|s| s.trim()).collect();
229 if f.len() >= 2 {
230 let util: u32 = f[1].parse().unwrap_or(100);
231 if util < 5 {
232 println!(" GPU {}: {}% utilized", f[0], util);
233 if !dry_run {
234 let _ = Command::new("nvidia-smi")
236 .args(["-i", f[0], "-pl", "50"])
237 .output();
238 println!(" → Power reduced to 50W (idle mode)");
239 } else {
240 println!(" → Would reduce power to idle mode");
241 }
242 }
243 }
244 }
245 }
246 }
247
248 FleetCommands::Rightsize => {
249 println!("GPU Right-Sizing Recommendations");
250 println!("{}", "=".repeat(60));
251
252 let exp_db = crate::experiments::tracker::experiments_db_path();
253 if exp_db.exists() {
254 println!("Based on your training history:");
255 println!();
256 println!(" Recommendation: Analyze GPU memory watermarks and utilization");
257 println!(" patterns from your experiments to determine optimal GPU type.");
258 println!();
259 println!(" If avg GPU memory < 40GB: Consider A100 40GB (cheaper)");
260 println!(" If avg GPU memory < 24GB: Consider A10G or L4 (much cheaper)");
261 println!(" If avg GPU util < 50%: Reduce GPU count or use smaller GPUs");
262 println!(" If NCCL is >20% of step time: Need faster interconnect (NVLink/IB)");
263 println!();
264 println!(" Run: zernel bench all — to generate utilization profile");
265 println!(" Run: zernel debug why-slow — to identify bottlenecks");
266 } else {
267 println!(" Need training data. Run experiments first: zernel run <script>");
268 }
269 }
270
271 FleetCommands::Plan { growth } => {
272 println!("Capacity Planning (growth: {growth}%/month)");
273 println!("{}", "=".repeat(60));
274
275 let output = Command::new("nvidia-smi")
276 .args(["--query-gpu=count", "--format=csv,noheader"])
277 .output();
278
279 let current_gpus: u32 = output
280 .ok()
281 .and_then(|o| {
282 String::from_utf8_lossy(&o.stdout)
283 .trim()
284 .lines()
285 .next()
286 .and_then(|s| s.parse().ok())
287 })
288 .unwrap_or(0);
289
290 println!(" Current GPUs: {current_gpus}");
291 println!();
292 println!(" Projected need (at {growth}% monthly growth):");
293
294 let mut gpus = current_gpus as f64;
295 for month in 1..=12 {
296 gpus *= 1.0 + growth / 100.0;
297 let cost_ondemand = gpus * 24.0 * 30.0 * 2.50;
298 let cost_reserved = gpus * 24.0 * 30.0 * 1.50;
299 println!(
300 " Month {month:>2}: {:>4.0} GPUs — ${cost_reserved:.0}-${cost_ondemand:.0}/mo",
301 gpus
302 );
303 }
304 }
305
306 FleetCommands::Health => {
307 println!("Fleet Health Report");
308 println!("{}", "=".repeat(60));
309
310 let checks = [
312 ("nvidia-smi", "GPU drivers"),
313 ("zernel", "Zernel CLI"),
314 ("python3", "Python runtime"),
315 ];
316
317 for (cmd, name) in &checks {
318 let ok = Command::new(cmd)
319 .arg("--version")
320 .output()
321 .map(|o| o.status.success())
322 .unwrap_or(false);
323 println!(" {name:<25} {}", if ok { "OK" } else { "MISSING" });
324 }
325
326 let zerneld_ok = std::net::TcpStream::connect_timeout(
328 &"127.0.0.1:9091".parse().expect("valid"),
329 std::time::Duration::from_millis(500),
330 )
331 .is_ok();
332 println!(
333 " {:<25} {}",
334 "zerneld (observability)",
335 if zerneld_ok { "RUNNING" } else { "STOPPED" }
336 );
337
338 let dash_ok = std::net::TcpStream::connect_timeout(
340 &"127.0.0.1:3000".parse().expect("valid"),
341 std::time::Duration::from_millis(500),
342 )
343 .is_ok();
344 println!(
345 " {:<25} {}",
346 "zernel-dashboard (web)",
347 if dash_ok { "RUNNING" } else { "STOPPED" }
348 );
349
350 let ollama_ok = std::net::TcpStream::connect_timeout(
352 &"127.0.0.1:11434".parse().expect("valid"),
353 std::time::Duration::from_millis(500),
354 )
355 .is_ok();
356 println!(
357 " {:<25} {}",
358 "ollama (local LLM)",
359 if ollama_ok { "RUNNING" } else { "STOPPED" }
360 );
361 }
362 }
363 Ok(())
364}