1use crate::telemetry::client::{self, TelemetryClient, TelemetrySnapshot};
4use anyhow::Result;
5use crossterm::{
6 event::{self, Event, KeyCode, KeyEventKind},
7 execute,
8 terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen},
9};
10use ratatui::{
11 backend::CrosstermBackend,
12 layout::{Constraint, Direction, Layout},
13 style::{Color, Modifier, Style},
14 text::{Line, Span},
15 widgets::{Block, Borders, Gauge, Paragraph},
16 Terminal,
17};
18use std::io;
19use tracing::info;
20
21struct DashboardState {
23 mode: DashboardMode,
24 tick: u64,
25 gpus: Vec<GpuInfo>,
26 loss: f64,
27 step: u64,
28 total_steps: u64,
29 cuda_p50_us: f64,
30 cuda_p99_us: f64,
31 nccl_p50_ms: f64,
32 nccl_p99_ms: f64,
33 dataloader_ms: f64,
34 pcie_gbps: f64,
35 phase: String,
36}
37
/// Where the dashboard's numbers come from.
enum DashboardMode {
    /// Values are synthesised locally; no telemetry connection is in use.
    Demo,
    /// Values are copied from telemetry snapshots received over a connection.
    Live,
}
42
/// Values backing a single GPU gauge on the dashboard.
struct GpuInfo {
    /// Index shown in the gauge title ("GPU <id>").
    id: u32,
    /// Gauge fill level as a percentage.
    util: u8,
    /// Memory in use, shown in the gauge title with a `GB` suffix.
    mem_used_gb: f64,
    /// Memory capacity the usage is shown against, same unit as `mem_used_gb`.
    mem_total_gb: f64,
}
49
50impl DashboardState {
51 fn new_demo() -> Self {
52 Self {
53 mode: DashboardMode::Demo,
54 tick: 0,
55 gpus: vec![
56 GpuInfo {
57 id: 0,
58 util: 94,
59 mem_used_gb: 78.2,
60 mem_total_gb: 80.0,
61 },
62 GpuInfo {
63 id: 1,
64 util: 91,
65 mem_used_gb: 77.8,
66 mem_total_gb: 80.0,
67 },
68 GpuInfo {
69 id: 2,
70 util: 96,
71 mem_used_gb: 79.1,
72 mem_total_gb: 80.0,
73 },
74 GpuInfo {
75 id: 3,
76 util: 93,
77 mem_used_gb: 78.9,
78 mem_total_gb: 80.0,
79 },
80 ],
81 loss: 1.8,
82 step: 0,
83 total_steps: 10000,
84 cuda_p50_us: 142.0,
85 cuda_p99_us: 891.0,
86 nccl_p50_ms: 34.0,
87 nccl_p99_ms: 67.0,
88 dataloader_ms: 8.0,
89 pcie_gbps: 31.2,
90 phase: "GpuCompute".into(),
91 }
92 }
93
94 fn new_live() -> Self {
95 Self {
96 mode: DashboardMode::Live,
97 tick: 0,
98 gpus: Vec::new(),
99 loss: 0.0,
100 step: 0,
101 total_steps: 0,
102 cuda_p50_us: 0.0,
103 cuda_p99_us: 0.0,
104 nccl_p50_ms: 0.0,
105 nccl_p99_ms: 0.0,
106 dataloader_ms: 0.0,
107 pcie_gbps: 0.0,
108 phase: "Unknown".into(),
109 }
110 }
111
112 fn apply_snapshot(&mut self, snap: &TelemetrySnapshot) {
114 self.tick += 1;
115 self.cuda_p50_us = snap.cuda_latency_p50_us;
116 self.cuda_p99_us = snap.cuda_latency_p99_us;
117 self.nccl_p50_ms = snap.nccl_allreduce_p50_ms;
118 self.nccl_p99_ms = snap.nccl_allreduce_p99_ms;
119 self.dataloader_ms = snap.dataloader_wait_p50_ms;
120
121 self.gpus.clear();
123 for (i, entry) in snap.gpu_utilization.iter().enumerate() {
124 let total_gb = if entry.peak_bytes > 0 {
125 entry.peak_bytes as f64 / (1024.0 * 1024.0 * 1024.0)
126 } else {
127 80.0
128 };
129 let used_gb = entry.current_bytes as f64 / (1024.0 * 1024.0 * 1024.0);
130 let util = if entry.peak_bytes > 0 {
131 ((entry.current_bytes as f64 / entry.peak_bytes as f64) * 100.0) as u8
132 } else {
133 0
134 };
135 self.gpus.push(GpuInfo {
136 id: i as u32,
137 util,
138 mem_used_gb: used_gb,
139 mem_total_gb: total_gb,
140 });
141 }
142 }
143
144 fn demo_tick(&mut self) {
146 self.tick += 1;
147 self.step = (self.step + 3).min(self.total_steps);
148 self.loss = (1.8 * (-0.0002 * self.step as f64).exp()).max(0.3);
149
150 for gpu in &mut self.gpus {
151 let jitter = ((self.tick * (gpu.id as u64 + 1) * 7) % 6) as i8 - 3;
152 gpu.util = (gpu.util as i8 + jitter).clamp(85, 99) as u8;
153 }
154
155 let phase_cycle = self.tick % 20;
156 self.phase = match phase_cycle {
157 0..=2 => "DataLoading",
158 3..=14 => "GpuCompute",
159 15..=16 => "NcclCollective",
160 17..=18 => "OptimizerStep",
161 _ => "GpuCompute",
162 }
163 .into();
164 }
165
166 fn mode_label(&self) -> &str {
167 match self.mode {
168 DashboardMode::Demo => " [DEMO] ",
169 DashboardMode::Live => " [LIVE] ",
170 }
171 }
172}
173
174pub async fn run() -> Result<()> {
176 info!("starting watch dashboard");
177
178 let port = client::ws_port();
180 let client = TelemetryClient::new("127.0.0.1", port);
181 let (mut state, mut rx) = match client.try_connect().await {
182 Some(rx) => {
183 info!(port, "connected to zerneld");
184 (DashboardState::new_live(), Some(rx))
185 }
186 None => {
187 info!("zerneld not available, using demo mode");
188 (DashboardState::new_demo(), None)
189 }
190 };
191
192 enable_raw_mode()?;
194 let mut stdout = io::stdout();
195 execute!(stdout, EnterAlternateScreen)?;
196 let backend = CrosstermBackend::new(stdout);
197 let mut terminal = Terminal::new(backend)?;
198
199 loop {
201 match &mut rx {
203 Some(receiver) => {
204 while let Ok(snap) = receiver.try_recv() {
206 state.apply_snapshot(&snap);
207 }
208 }
209 None => {
210 state.demo_tick();
211 }
212 }
213
214 terminal.draw(|f| render_dashboard(f, &state))?;
215
216 if event::poll(std::time::Duration::from_millis(166))? {
218 if let Event::Key(key) = event::read()? {
219 if key.kind == KeyEventKind::Press {
220 match key.code {
221 KeyCode::Char('q') | KeyCode::Esc => break,
222 KeyCode::Char('r') => {
223 state = match state.mode {
224 DashboardMode::Demo => DashboardState::new_demo(),
225 DashboardMode::Live => DashboardState::new_live(),
226 };
227 }
228 _ => {}
229 }
230 }
231 }
232 }
233 }
234
235 disable_raw_mode()?;
237 execute!(terminal.backend_mut(), LeaveAlternateScreen)?;
238 terminal.show_cursor()?;
239
240 Ok(())
241}
242
243fn render_dashboard(f: &mut ratatui::Frame, state: &DashboardState) {
244 let chunks = Layout::default()
245 .direction(Direction::Vertical)
246 .constraints([
247 Constraint::Length(3),
248 Constraint::Length(6),
249 Constraint::Length(5),
250 Constraint::Length(6),
251 Constraint::Min(3),
252 ])
253 .split(f.area());
254
255 let elapsed_mins = state.tick / 6;
257 let title = Paragraph::new(Line::from(vec![
258 Span::styled(
259 " Zernel Watch ",
260 Style::default()
261 .fg(Color::Cyan)
262 .add_modifier(Modifier::BOLD),
263 ),
264 Span::styled(
265 state.mode_label(),
266 Style::default().fg(match state.mode {
267 DashboardMode::Demo => Color::Yellow,
268 DashboardMode::Live => Color::Green,
269 }),
270 ),
271 Span::raw(format!(
272 " step: {}/{} | elapsed: {}m",
273 state.step, state.total_steps, elapsed_mins
274 )),
275 ]))
276 .block(Block::default().borders(Borders::ALL));
277 f.render_widget(title, chunks[0]);
278
279 if !state.gpus.is_empty() {
281 let gpu_pct = 100 / state.gpus.len().max(1) as u16;
282 let gpu_constraints: Vec<Constraint> = state
283 .gpus
284 .iter()
285 .map(|_| Constraint::Percentage(gpu_pct))
286 .collect();
287 let gpu_chunks = Layout::default()
288 .direction(Direction::Horizontal)
289 .constraints(gpu_constraints)
290 .split(chunks[1]);
291
292 for (i, gpu) in state.gpus.iter().enumerate() {
293 let color = if gpu.util > 90 {
294 Color::Green
295 } else if gpu.util > 70 {
296 Color::Yellow
297 } else {
298 Color::Red
299 };
300 let gauge = Gauge::default()
301 .block(
302 Block::default()
303 .title(format!(
304 "GPU {} | {:.1}/{:.1} GB",
305 gpu.id, gpu.mem_used_gb, gpu.mem_total_gb
306 ))
307 .borders(Borders::ALL),
308 )
309 .gauge_style(Style::default().fg(color))
310 .percent(gpu.util as u16)
311 .label(format!("{}%", gpu.util));
312 f.render_widget(gauge, gpu_chunks[i]);
313 }
314 } else {
315 let msg = Paragraph::new(" Waiting for GPU data...")
316 .block(Block::default().title(" GPUs ").borders(Borders::ALL));
317 f.render_widget(msg, chunks[1]);
318 }
319
320 let loss_str = format!("{:.4}", state.loss);
322 let progress = if state.total_steps > 0 {
323 state.step as f64 / state.total_steps as f64
324 } else {
325 0.0
326 };
327
328 let metrics_text = vec![
329 Line::from(vec![
330 Span::styled(" loss: ", Style::default().fg(Color::Yellow)),
331 Span::raw(&loss_str),
332 Span::raw(format!(" step: {}/{}", state.step, state.total_steps)),
333 ]),
334 Line::from(vec![
335 Span::styled(" progress: ", Style::default().fg(Color::Yellow)),
336 Span::raw(format!("{:.1}%", progress * 100.0)),
337 ]),
338 ];
339 let metrics = Paragraph::new(metrics_text).block(
340 Block::default()
341 .title(" Training Metrics ")
342 .borders(Borders::ALL),
343 );
344 f.render_widget(metrics, chunks[2]);
345
346 let telem_text = vec![
348 Line::from(format!(
349 " CUDA launch: p50={:.0}us p99={:.0}us DataLoader wait: p50={:.0}ms",
350 state.cuda_p50_us, state.cuda_p99_us, state.dataloader_ms
351 )),
352 Line::from(format!(
353 " NCCL allreduce: p50={:.0}ms p99={:.0}ms PCIe BW: {:.1} GB/s",
354 state.nccl_p50_ms, state.nccl_p99_ms, state.pcie_gbps
355 )),
356 ];
357 let telem = Paragraph::new(telem_text).block(
358 Block::default()
359 .title(" eBPF Telemetry ")
360 .borders(Borders::ALL),
361 );
362 f.render_widget(telem, chunks[3]);
363
364 let phase_color = match state.phase.as_str() {
366 "GpuCompute" => Color::Green,
367 "DataLoading" => Color::Yellow,
368 "NcclCollective" => Color::Cyan,
369 "OptimizerStep" => Color::Magenta,
370 _ => Color::White,
371 };
372 let sched_text = vec![Line::from(vec![
373 Span::raw(" Phase: "),
374 Span::styled(
375 &state.phase,
376 Style::default()
377 .fg(phase_color)
378 .add_modifier(Modifier::BOLD),
379 ),
380 Span::raw(" [q] quit [r] reset"),
381 ])];
382 let sched = Paragraph::new(sched_text)
383 .block(Block::default().title(" Scheduler ").borders(Borders::ALL));
384 f.render_widget(sched, chunks[4]);
385}