zernel_ebpf/
simulator.rs

1// Copyright (C) 2026 Dyber, Inc. — GPL-2.0
2
3//! Generates simulated ML workload telemetry for development and demos.
4//! Used when BPF probes are unavailable (non-Linux or no root).
5
6use crate::aggregation::AggregatedMetrics;
7use std::sync::Arc;
8use tokio::sync::RwLock;
9use tracing::debug;
10
11/// Simulates realistic ML telemetry data that cycles through training phases.
12pub async fn run_simulator(metrics: Arc<RwLock<AggregatedMetrics>>, interval_ms: u64) {
13    let mut tick = 0u64;
14    let mut interval = tokio::time::interval(tokio::time::Duration::from_millis(interval_ms));
15
16    // Simulated GPU memory (4 GPUs, 80GB each)
17    let gpu_mem_base: [u64; 4] = [
18        78 * 1024 * 1024 * 1024,
19        77 * 1024 * 1024 * 1024,
20        79 * 1024 * 1024 * 1024,
21        78 * 1024 * 1024 * 1024,
22    ];
23
24    loop {
25        interval.tick().await;
26        tick += 1;
27
28        let mut m = metrics.write().await;
29        let now_ms = std::time::SystemTime::now()
30            .duration_since(std::time::UNIX_EPOCH)
31            .unwrap_or_default()
32            .as_millis() as u64;
33        m.last_update_ms = now_ms;
34
35        // Simulate GPU memory with small fluctuations
36        for (i, base) in gpu_mem_base.iter().enumerate() {
37            let jitter = ((tick * (i as u64 + 1) * 7) % 500) * 1024 * 1024;
38            let used = base + jitter;
39            m.record_gpu_mem(1000, i as u32, used, used);
40        }
41
42        // Simulate CUDA launch latency (100-500us, with occasional spikes)
43        let base_latency_ns = 142_000u64; // 142us baseline
44        let spike = if tick.is_multiple_of(50) { 500_000 } else { 0 };
45        let jitter = ((tick * 31) % 200) * 1000;
46        m.record_cuda_latency(1000, base_latency_ns + jitter + spike);
47
48        // Simulate NCCL all-reduce (every ~10 ticks, 30-70ms)
49        if tick.is_multiple_of(10) {
50            let duration_ns = 34_000_000 + ((tick * 17) % 30) * 1_000_000;
51            m.record_nccl("all_reduce", duration_ns);
52        }
53
54        // Simulate DataLoader wait (5-15ms)
55        let wait_ns = 8_000_000 + ((tick * 13) % 7) * 1_000_000;
56        m.record_dataloader_wait(1000, wait_ns);
57
58        debug!(tick, "simulator update");
59    }
60}