// zernel/commands/bench.rs
// Copyright (C) 2026 Dyber, Inc. — Proprietary

//! zernel bench — ML benchmark suite

use anyhow::{Context, Result};
use clap::Subcommand;
use std::process::Command;
use std::time::Instant;

10#[derive(Subcommand)]
11pub enum BenchCommands {
12    /// Run full benchmark suite
13    All,
14    /// Quick 5-minute smoke test
15    Quick,
16    /// GPU compute throughput
17    Gpu,
18    /// Multi-GPU NCCL communication bandwidth
19    Nccl,
20    /// Dataset loading throughput
21    Dataloader {
22        /// Path to dataset directory
23        #[arg(default_value = ".")]
24        path: String,
25        /// Number of DataLoader workers
26        #[arg(long, default_value = "4")]
27        workers: u32,
28    },
29    /// GPU memory allocation benchmark
30    Memory,
31    /// End-to-end training benchmark
32    E2e {
33        /// Model to benchmark
34        #[arg(long, default_value = "resnet50")]
35        model: String,
36        /// Number of iterations
37        #[arg(long, default_value = "100")]
38        iterations: u32,
39    },
40    /// Generate benchmark report
41    Report,
42}
43
44fn run_python_bench(name: &str, code: &str) -> Result<(f64, String)> {
45    println!("  Running: {name}...");
46    let start = Instant::now();
47    let output = Command::new("python3")
48        .args(["-c", code])
49        .output()
50        .with_context(|| format!("failed to run benchmark: {name}"))?;
51    let elapsed = start.elapsed().as_secs_f64();
52    let stdout = String::from_utf8_lossy(&output.stdout).to_string();
53    let stderr = String::from_utf8_lossy(&output.stderr).to_string();
54
55    if !output.status.success() {
56        println!("    SKIP ({stderr})");
57        return Ok((0.0, "SKIP".into()));
58    }
59
60    Ok((elapsed, stdout.trim().to_string()))
61}
62
63pub async fn run(cmd: BenchCommands) -> Result<()> {
64    match cmd {
65        BenchCommands::All | BenchCommands::Quick => {
66            let quick = matches!(cmd, BenchCommands::Quick);
67            println!(
68                "Zernel ML Benchmark Suite {}",
69                if quick { "(Quick)" } else { "(Full)" }
70            );
71            println!("{}", "=".repeat(60));
72            println!();
73
74            let mut results = Vec::new();
75
76            // GPU Info
77            if let Ok(output) = Command::new("nvidia-smi")
78                .args([
79                    "--query-gpu=name,memory.total,driver_version",
80                    "--format=csv,noheader",
81                ])
82                .output()
83            {
84                let info = String::from_utf8_lossy(&output.stdout);
85                println!("Hardware: {}", info.trim());
86                println!();
87            }
88
89            // 1. GPU FLOPS
90            println!("[1/5] GPU Compute Throughput");
91            let iters = if quick { 100 } else { 1000 };
92            let (_t, out) = run_python_bench(
93                "matmul",
94                &format!(
95                    "import torch; torch.cuda.synchronize(); import time; \
96                 a=torch.randn(4096,4096,device='cuda'); b=torch.randn(4096,4096,device='cuda'); \
97                 torch.cuda.synchronize(); t0=time.time(); \
98                 [torch.mm(a,b) for _ in range({iters})]; \
99                 torch.cuda.synchronize(); t1=time.time(); \
100                 tflops=2*4096**3*{iters}/((t1-t0)*1e12); \
101                 print(f'{{tflops:.1f}} TFLOPS ({{(t1-t0):.2f}}s for {iters} matmuls)')"
102                ),
103            )?;
104            results.push(("GPU Compute", out.clone()));
105            println!("    {out}");
106            println!();
107
108            // 2. GPU Memory Bandwidth
109            println!("[2/5] GPU Memory Bandwidth");
110            let (_, out) = run_python_bench(
111                "membw",
112                "import torch, time; \
113                 size=1024*1024*256; a=torch.randn(size,device='cuda'); \
114                 torch.cuda.synchronize(); t0=time.time(); \
115                 [a.clone() for _ in range(100)]; \
116                 torch.cuda.synchronize(); t1=time.time(); \
117                 bw=size*4*100/((t1-t0)*1e9); \
118                 print(f'{bw:.0f} GB/s ({(t1-t0):.2f}s)')",
119            )?;
120            results.push(("Memory BW", out.clone()));
121            println!("    {out}");
122            println!();
123
124            // 3. NCCL (multi-GPU only)
125            println!("[3/5] NCCL Multi-GPU Communication");
126            let (_, out) = run_python_bench(
127                "nccl",
128                "import torch, time; \
129                 n=torch.cuda.device_count(); \
130                 if n<2: print(f'SKIP (only {n} GPU)'); exit(); \
131                 import torch.distributed as dist; \
132                 print(f'{n} GPUs detected — NCCL bench requires torchrun')",
133            )?;
134            results.push(("NCCL", out.clone()));
135            println!("    {out}");
136            println!();
137
138            // 4. DataLoader
139            println!("[4/5] DataLoader Throughput");
140            let (_, out) = run_python_bench(
141                "dataloader",
142                "import torch, time; from torch.utils.data import DataLoader, TensorDataset; \
143                 ds=TensorDataset(torch.randn(10000,3,224,224),torch.randint(0,1000,(10000,))); \
144                 dl=DataLoader(ds,batch_size=64,num_workers=4,pin_memory=True); \
145                 t0=time.time(); \
146                 for batch in dl: pass; \
147                 t1=time.time(); \
148                 print(f'{10000/(t1-t0):.0f} samples/s ({(t1-t0):.2f}s for 10K samples)')",
149            )?;
150            results.push(("DataLoader", out.clone()));
151            println!("    {out}");
152            println!();
153
154            // 5. Training step
155            println!("[5/5] Training Step Latency");
156            let (_, out) = run_python_bench(
157                "trainstep",
158                "import torch, torch.nn as nn, time; \
159                 model=nn.Linear(4096,4096).cuda(); opt=torch.optim.Adam(model.parameters()); \
160                 x=torch.randn(256,4096,device='cuda'); \
161                 torch.cuda.synchronize(); t0=time.time(); \
162                 for _ in range(100): \
163                     loss=model(x).sum(); loss.backward(); opt.step(); opt.zero_grad(); \
164                 torch.cuda.synchronize(); t1=time.time(); \
165                 print(f'{(t1-t0)/100*1000:.1f} ms/step ({(t1-t0):.2f}s for 100 steps)')",
166            )?;
167            results.push(("Train Step", out.clone()));
168            println!("    {out}");
169
170            // Summary
171            println!();
172            println!("{}", "=".repeat(60));
173            println!("Summary");
174            println!("{}", "-".repeat(60));
175            for (name, result) in &results {
176                println!("  {:<20} {}", name, result);
177            }
178            println!();
179        }
180
181        BenchCommands::Gpu => {
182            println!("GPU Compute Benchmark");
183            let (_, out) = run_python_bench(
184                "matmul-full",
185                "import torch, time; sizes=[1024,2048,4096,8192]; \
186                 for s in sizes: \
187                     a=torch.randn(s,s,device='cuda'); b=torch.randn(s,s,device='cuda'); \
188                     torch.cuda.synchronize(); t0=time.time(); \
189                     [torch.mm(a,b) for _ in range(100)]; \
190                     torch.cuda.synchronize(); t1=time.time(); \
191                     tflops=2*s**3*100/((t1-t0)*1e12); \
192                     print(f'  {s}x{s}: {tflops:.1f} TFLOPS ({(t1-t0)/100*1000:.1f} ms/op)')",
193            )?;
194            println!("{out}");
195        }
196
197        BenchCommands::Nccl => {
198            println!("NCCL Benchmark — requires multi-GPU + torchrun");
199            println!("Run: torchrun --nproc_per_node=auto -m torch.distributed.run nccl_bench.py");
200        }
201
202        BenchCommands::Dataloader { path, workers } => {
203            println!("DataLoader Benchmark (path: {path}, workers: {workers})");
204            let (_, out) = run_python_bench(
205                "dl-bench",
206                &format!(
207                "import torch, time, os; from torch.utils.data import DataLoader, TensorDataset; \
208                 n=50000; ds=TensorDataset(torch.randn(n,3,224,224),torch.randint(0,1000,(n,))); \
209                 for w in [0,1,2,4,{workers}]: \
210                     dl=DataLoader(ds,batch_size=64,num_workers=w,pin_memory=True); \
211                     t0=time.time(); \
212                     for b in dl: pass; \
213                     t1=time.time(); \
214                     print(f'  workers={{w}}: {{n/(t1-t0):.0f}} samples/s')"
215            ),
216            )?;
217            println!("{out}");
218        }
219
220        BenchCommands::Memory => {
221            println!("GPU Memory Allocation Benchmark");
222            let (_, out) = run_python_bench(
223                "mem-bench",
224                "import torch, time; \
225                 sizes=[1,10,100,1000]; \
226                 for mb in sizes: \
227                     n=mb*1024*256; t0=time.time(); \
228                     [torch.empty(n,device='cuda') for _ in range(100)]; \
229                     torch.cuda.synchronize(); t1=time.time(); \
230                     print(f'  {mb}MB alloc: {(t1-t0)/100*1e6:.0f} us/alloc')",
231            )?;
232            println!("{out}");
233        }
234
235        BenchCommands::E2e { model, iterations } => {
236            println!("End-to-End Training Benchmark: {model} ({iterations} iterations)");
237            let (_, out) = run_python_bench("e2e", &format!(
238                "import torch, torchvision.models as m, time; \
239                 model=getattr(m,'{model}')().cuda(); \
240                 opt=torch.optim.SGD(model.parameters(),lr=0.01); \
241                 x=torch.randn(32,3,224,224,device='cuda'); t=torch.randint(0,1000,(32,),device='cuda'); \
242                 loss_fn=torch.nn.CrossEntropyLoss(); \
243                 torch.cuda.synchronize(); t0=time.time(); \
244                 for i in range({iterations}): \
245                     out=model(x); loss=loss_fn(out,t); loss.backward(); opt.step(); opt.zero_grad(); \
246                 torch.cuda.synchronize(); t1=time.time(); \
247                 ips=32*{iterations}/(t1-t0); \
248                 print(f'{model}: {{ips:.0f}} images/s ({{(t1-t0)/{iterations}*1000:.1f}} ms/step)')"
249            ))?;
250            println!("{out}");
251        }
252
253        BenchCommands::Report => {
254            println!("Run: zernel bench all > benchmark-report.txt");
255            println!("Full HTML report generation coming in a future release.");
256        }
257    }
258    Ok(())
259}