zernel/commands/
watch.rs

1// Copyright (C) 2026 Dyber, Inc. — Proprietary
2
3use crate::telemetry::client::{self, TelemetryClient, TelemetrySnapshot};
4use anyhow::Result;
5use crossterm::{
6    event::{self, Event, KeyCode, KeyEventKind},
7    execute,
8    terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
9};
10use ratatui::{
11    backend::CrosstermBackend,
12    layout::{Constraint, Direction, Layout},
13    style::{Color, Modifier, Style},
14    text::{Line, Span},
15    widgets::{Block, Borders, Gauge, Paragraph},
16    Terminal,
17};
18use std::io;
19use tracing::info;
20
21/// Dashboard state — either live from zerneld or simulated demo data.
22struct DashboardState {
23    mode: DashboardMode,
24    tick: u64,
25    gpus: Vec<GpuInfo>,
26    loss: f64,
27    step: u64,
28    total_steps: u64,
29    cuda_p50_us: f64,
30    cuda_p99_us: f64,
31    nccl_p50_ms: f64,
32    nccl_p99_ms: f64,
33    dataloader_ms: f64,
34    pcie_gbps: f64,
35    phase: String,
36}
37
/// Where the dashboard's numbers come from.
///
/// The enum carries no data, so it is `Copy` and comparable; that lets callers
/// inspect or stash the mode without borrowing the whole dashboard state.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DashboardMode {
    /// Locally simulated data, used when zerneld is not reachable.
    Demo,
    /// Telemetry snapshots received from zerneld.
    Live,
}
42
/// Display data for a single GPU gauge.
#[derive(Debug, Clone, PartialEq)]
struct GpuInfo {
    /// GPU index shown in the gauge title.
    id: u32,
    /// Percentage shown by the gauge.
    util: u8,
    /// Memory currently in use, in GB as displayed.
    mem_used_gb: f64,
    /// Memory capacity shown next to the used amount, in GB as displayed.
    mem_total_gb: f64,
}
49
50impl DashboardState {
51    fn new_demo() -> Self {
52        Self {
53            mode: DashboardMode::Demo,
54            tick: 0,
55            gpus: vec![
56                GpuInfo {
57                    id: 0,
58                    util: 94,
59                    mem_used_gb: 78.2,
60                    mem_total_gb: 80.0,
61                },
62                GpuInfo {
63                    id: 1,
64                    util: 91,
65                    mem_used_gb: 77.8,
66                    mem_total_gb: 80.0,
67                },
68                GpuInfo {
69                    id: 2,
70                    util: 96,
71                    mem_used_gb: 79.1,
72                    mem_total_gb: 80.0,
73                },
74                GpuInfo {
75                    id: 3,
76                    util: 93,
77                    mem_used_gb: 78.9,
78                    mem_total_gb: 80.0,
79                },
80            ],
81            loss: 1.8,
82            step: 0,
83            total_steps: 10000,
84            cuda_p50_us: 142.0,
85            cuda_p99_us: 891.0,
86            nccl_p50_ms: 34.0,
87            nccl_p99_ms: 67.0,
88            dataloader_ms: 8.0,
89            pcie_gbps: 31.2,
90            phase: "GpuCompute".into(),
91        }
92    }
93
94    fn new_live() -> Self {
95        Self {
96            mode: DashboardMode::Live,
97            tick: 0,
98            gpus: Vec::new(),
99            loss: 0.0,
100            step: 0,
101            total_steps: 0,
102            cuda_p50_us: 0.0,
103            cuda_p99_us: 0.0,
104            nccl_p50_ms: 0.0,
105            nccl_p99_ms: 0.0,
106            dataloader_ms: 0.0,
107            pcie_gbps: 0.0,
108            phase: "Unknown".into(),
109        }
110    }
111
112    /// Update from a real zerneld telemetry snapshot.
113    fn apply_snapshot(&mut self, snap: &TelemetrySnapshot) {
114        self.tick += 1;
115        self.cuda_p50_us = snap.cuda_latency_p50_us;
116        self.cuda_p99_us = snap.cuda_latency_p99_us;
117        self.nccl_p50_ms = snap.nccl_allreduce_p50_ms;
118        self.nccl_p99_ms = snap.nccl_allreduce_p99_ms;
119        self.dataloader_ms = snap.dataloader_wait_p50_ms;
120
121        // Convert GPU memory entries to display info
122        self.gpus.clear();
123        for (i, entry) in snap.gpu_utilization.iter().enumerate() {
124            let total_gb = if entry.peak_bytes > 0 {
125                entry.peak_bytes as f64 / (1024.0 * 1024.0 * 1024.0)
126            } else {
127                80.0
128            };
129            let used_gb = entry.current_bytes as f64 / (1024.0 * 1024.0 * 1024.0);
130            let util = if entry.peak_bytes > 0 {
131                ((entry.current_bytes as f64 / entry.peak_bytes as f64) * 100.0) as u8
132            } else {
133                0
134            };
135            self.gpus.push(GpuInfo {
136                id: i as u32,
137                util,
138                mem_used_gb: used_gb,
139                mem_total_gb: total_gb,
140            });
141        }
142    }
143
144    /// Advance demo state by one tick.
145    fn demo_tick(&mut self) {
146        self.tick += 1;
147        self.step = (self.step + 3).min(self.total_steps);
148        self.loss = (1.8 * (-0.0002 * self.step as f64).exp()).max(0.3);
149
150        for gpu in &mut self.gpus {
151            let jitter = ((self.tick * (gpu.id as u64 + 1) * 7) % 6) as i8 - 3;
152            gpu.util = (gpu.util as i8 + jitter).clamp(85, 99) as u8;
153        }
154
155        let phase_cycle = self.tick % 20;
156        self.phase = match phase_cycle {
157            0..=2 => "DataLoading",
158            3..=14 => "GpuCompute",
159            15..=16 => "NcclCollective",
160            17..=18 => "OptimizerStep",
161            _ => "GpuCompute",
162        }
163        .into();
164    }
165
166    fn mode_label(&self) -> &str {
167        match self.mode {
168            DashboardMode::Demo => " [DEMO] ",
169            DashboardMode::Live => " [LIVE] ",
170        }
171    }
172}
173
174/// Launch the full-screen Ratatui dashboard.
175pub async fn run() -> Result<()> {
176    info!("starting watch dashboard");
177
178    // Try to connect to zerneld
179    let port = client::ws_port();
180    let client = TelemetryClient::new("127.0.0.1", port);
181    let (mut state, mut rx) = match client.try_connect().await {
182        Some(rx) => {
183            info!(port, "connected to zerneld");
184            (DashboardState::new_live(), Some(rx))
185        }
186        None => {
187            info!("zerneld not available, using demo mode");
188            (DashboardState::new_demo(), None)
189        }
190    };
191
192    // Set up terminal
193    enable_raw_mode()?;
194    let mut stdout = io::stdout();
195    execute!(stdout, EnterAlternateScreen)?;
196    let backend = CrosstermBackend::new(stdout);
197    let mut terminal = Terminal::new(backend)?;
198
199    // Main render loop
200    loop {
201        // Update state
202        match &mut rx {
203            Some(receiver) => {
204                // Try to drain latest snapshot (non-blocking)
205                while let Ok(snap) = receiver.try_recv() {
206                    state.apply_snapshot(&snap);
207                }
208            }
209            None => {
210                state.demo_tick();
211            }
212        }
213
214        terminal.draw(|f| render_dashboard(f, &state))?;
215
216        // Handle input (non-blocking, 166ms = ~6fps)
217        if event::poll(std::time::Duration::from_millis(166))? {
218            if let Event::Key(key) = event::read()? {
219                if key.kind == KeyEventKind::Press {
220                    match key.code {
221                        KeyCode::Char('q') | KeyCode::Esc => break,
222                        KeyCode::Char('r') => {
223                            state = match state.mode {
224                                DashboardMode::Demo => DashboardState::new_demo(),
225                                DashboardMode::Live => DashboardState::new_live(),
226                            };
227                        }
228                        _ => {}
229                    }
230                }
231            }
232        }
233    }
234
235    // Restore terminal
236    disable_raw_mode()?;
237    execute!(terminal.backend_mut(), LeaveAlternateScreen)?;
238    terminal.show_cursor()?;
239
240    Ok(())
241}
242
243fn render_dashboard(f: &mut ratatui::Frame, state: &DashboardState) {
244    let chunks = Layout::default()
245        .direction(Direction::Vertical)
246        .constraints([
247            Constraint::Length(3),
248            Constraint::Length(6),
249            Constraint::Length(5),
250            Constraint::Length(6),
251            Constraint::Min(3),
252        ])
253        .split(f.area());
254
255    // Title
256    let elapsed_mins = state.tick / 6;
257    let title = Paragraph::new(Line::from(vec![
258        Span::styled(
259            " Zernel Watch ",
260            Style::default()
261                .fg(Color::Cyan)
262                .add_modifier(Modifier::BOLD),
263        ),
264        Span::styled(
265            state.mode_label(),
266            Style::default().fg(match state.mode {
267                DashboardMode::Demo => Color::Yellow,
268                DashboardMode::Live => Color::Green,
269            }),
270        ),
271        Span::raw(format!(
272            " step: {}/{}  |  elapsed: {}m",
273            state.step, state.total_steps, elapsed_mins
274        )),
275    ]))
276    .block(Block::default().borders(Borders::ALL));
277    f.render_widget(title, chunks[0]);
278
279    // GPU utilization
280    if !state.gpus.is_empty() {
281        let gpu_pct = 100 / state.gpus.len().max(1) as u16;
282        let gpu_constraints: Vec<Constraint> = state
283            .gpus
284            .iter()
285            .map(|_| Constraint::Percentage(gpu_pct))
286            .collect();
287        let gpu_chunks = Layout::default()
288            .direction(Direction::Horizontal)
289            .constraints(gpu_constraints)
290            .split(chunks[1]);
291
292        for (i, gpu) in state.gpus.iter().enumerate() {
293            let color = if gpu.util > 90 {
294                Color::Green
295            } else if gpu.util > 70 {
296                Color::Yellow
297            } else {
298                Color::Red
299            };
300            let gauge = Gauge::default()
301                .block(
302                    Block::default()
303                        .title(format!(
304                            "GPU {} | {:.1}/{:.1} GB",
305                            gpu.id, gpu.mem_used_gb, gpu.mem_total_gb
306                        ))
307                        .borders(Borders::ALL),
308                )
309                .gauge_style(Style::default().fg(color))
310                .percent(gpu.util as u16)
311                .label(format!("{}%", gpu.util));
312            f.render_widget(gauge, gpu_chunks[i]);
313        }
314    } else {
315        let msg = Paragraph::new(" Waiting for GPU data...")
316            .block(Block::default().title(" GPUs ").borders(Borders::ALL));
317        f.render_widget(msg, chunks[1]);
318    }
319
320    // Training metrics
321    let loss_str = format!("{:.4}", state.loss);
322    let progress = if state.total_steps > 0 {
323        state.step as f64 / state.total_steps as f64
324    } else {
325        0.0
326    };
327
328    let metrics_text = vec![
329        Line::from(vec![
330            Span::styled(" loss: ", Style::default().fg(Color::Yellow)),
331            Span::raw(&loss_str),
332            Span::raw(format!("   step: {}/{}", state.step, state.total_steps)),
333        ]),
334        Line::from(vec![
335            Span::styled(" progress: ", Style::default().fg(Color::Yellow)),
336            Span::raw(format!("{:.1}%", progress * 100.0)),
337        ]),
338    ];
339    let metrics = Paragraph::new(metrics_text).block(
340        Block::default()
341            .title(" Training Metrics ")
342            .borders(Borders::ALL),
343    );
344    f.render_widget(metrics, chunks[2]);
345
346    // eBPF telemetry
347    let telem_text = vec![
348        Line::from(format!(
349            " CUDA launch: p50={:.0}us  p99={:.0}us    DataLoader wait: p50={:.0}ms",
350            state.cuda_p50_us, state.cuda_p99_us, state.dataloader_ms
351        )),
352        Line::from(format!(
353            " NCCL allreduce: p50={:.0}ms  p99={:.0}ms    PCIe BW: {:.1} GB/s",
354            state.nccl_p50_ms, state.nccl_p99_ms, state.pcie_gbps
355        )),
356    ];
357    let telem = Paragraph::new(telem_text).block(
358        Block::default()
359            .title(" eBPF Telemetry ")
360            .borders(Borders::ALL),
361    );
362    f.render_widget(telem, chunks[3]);
363
364    // Scheduler phase
365    let phase_color = match state.phase.as_str() {
366        "GpuCompute" => Color::Green,
367        "DataLoading" => Color::Yellow,
368        "NcclCollective" => Color::Cyan,
369        "OptimizerStep" => Color::Magenta,
370        _ => Color::White,
371    };
372    let sched_text = vec![Line::from(vec![
373        Span::raw(" Phase: "),
374        Span::styled(
375            &state.phase,
376            Style::default()
377                .fg(phase_color)
378                .add_modifier(Modifier::BOLD),
379        ),
380        Span::raw("   [q] quit  [r] reset"),
381    ])];
382    let sched = Paragraph::new(sched_text)
383        .block(Block::default().title(" Scheduler ").borders(Borders::ALL));
384    f.render_widget(sched, chunks[4]);
385}