1use anyhow::{Context, Result};
6use clap::Subcommand;
7use std::process::Command;
8use std::time::Instant;
9
// Subcommands of the `bench` CLI group.
//
// NOTE(review): plain `//` comments are used on purpose — with clap's
// derive API, `///` doc comments become `--help` text, so adding them
// would change the CLI's observable output.
#[derive(Subcommand)]
pub enum BenchCommands {
    // Full 5-stage benchmark suite (1000 matmul iterations).
    All,
    // Same suite with a reduced iteration count (100 matmuls).
    Quick,
    // GPU matmul throughput sweep across matrix sizes.
    Gpu,
    // Multi-GPU NCCL communication check (real bench needs torchrun).
    Nccl,
    // DataLoader throughput sweep across worker counts.
    Dataloader {
        // Dataset path (positional). Currently only echoed in the banner;
        // the benchmark itself generates synthetic tensors.
        #[arg(default_value = ".")]
        path: String,
        // Largest worker count included in the sweep.
        #[arg(long, default_value = "4")]
        workers: u32,
    },
    // GPU memory allocation latency across allocation sizes.
    Memory,
    // End-to-end training-step benchmark on a torchvision model.
    E2e {
        // torchvision model name, resolved via `getattr(models, ...)`.
        #[arg(long, default_value = "resnet50")]
        model: String,
        // Number of training iterations to time.
        #[arg(long, default_value = "100")]
        iterations: u32,
    },
    // Placeholder: points at `bench all` output redirection.
    Report,
}
43
44fn run_python_bench(name: &str, code: &str) -> Result<(f64, String)> {
45 println!(" Running: {name}...");
46 let start = Instant::now();
47 let output = Command::new("python3")
48 .args(["-c", code])
49 .output()
50 .with_context(|| format!("failed to run benchmark: {name}"))?;
51 let elapsed = start.elapsed().as_secs_f64();
52 let stdout = String::from_utf8_lossy(&output.stdout).to_string();
53 let stderr = String::from_utf8_lossy(&output.stderr).to_string();
54
55 if !output.status.success() {
56 println!(" SKIP ({stderr})");
57 return Ok((0.0, "SKIP".into()));
58 }
59
60 Ok((elapsed, stdout.trim().to_string()))
61}
62
63pub async fn run(cmd: BenchCommands) -> Result<()> {
64 match cmd {
65 BenchCommands::All | BenchCommands::Quick => {
66 let quick = matches!(cmd, BenchCommands::Quick);
67 println!(
68 "Zernel ML Benchmark Suite {}",
69 if quick { "(Quick)" } else { "(Full)" }
70 );
71 println!("{}", "=".repeat(60));
72 println!();
73
74 let mut results = Vec::new();
75
76 if let Ok(output) = Command::new("nvidia-smi")
78 .args([
79 "--query-gpu=name,memory.total,driver_version",
80 "--format=csv,noheader",
81 ])
82 .output()
83 {
84 let info = String::from_utf8_lossy(&output.stdout);
85 println!("Hardware: {}", info.trim());
86 println!();
87 }
88
89 println!("[1/5] GPU Compute Throughput");
91 let iters = if quick { 100 } else { 1000 };
92 let (_t, out) = run_python_bench(
93 "matmul",
94 &format!(
95 "import torch; torch.cuda.synchronize(); import time; \
96 a=torch.randn(4096,4096,device='cuda'); b=torch.randn(4096,4096,device='cuda'); \
97 torch.cuda.synchronize(); t0=time.time(); \
98 [torch.mm(a,b) for _ in range({iters})]; \
99 torch.cuda.synchronize(); t1=time.time(); \
100 tflops=2*4096**3*{iters}/((t1-t0)*1e12); \
101 print(f'{{tflops:.1f}} TFLOPS ({{(t1-t0):.2f}}s for {iters} matmuls)')"
102 ),
103 )?;
104 results.push(("GPU Compute", out.clone()));
105 println!(" {out}");
106 println!();
107
108 println!("[2/5] GPU Memory Bandwidth");
110 let (_, out) = run_python_bench(
111 "membw",
112 "import torch, time; \
113 size=1024*1024*256; a=torch.randn(size,device='cuda'); \
114 torch.cuda.synchronize(); t0=time.time(); \
115 [a.clone() for _ in range(100)]; \
116 torch.cuda.synchronize(); t1=time.time(); \
117 bw=size*4*100/((t1-t0)*1e9); \
118 print(f'{bw:.0f} GB/s ({(t1-t0):.2f}s)')",
119 )?;
120 results.push(("Memory BW", out.clone()));
121 println!(" {out}");
122 println!();
123
124 println!("[3/5] NCCL Multi-GPU Communication");
126 let (_, out) = run_python_bench(
127 "nccl",
128 "import torch, time; \
129 n=torch.cuda.device_count(); \
130 if n<2: print(f'SKIP (only {n} GPU)'); exit(); \
131 import torch.distributed as dist; \
132 print(f'{n} GPUs detected — NCCL bench requires torchrun')",
133 )?;
134 results.push(("NCCL", out.clone()));
135 println!(" {out}");
136 println!();
137
138 println!("[4/5] DataLoader Throughput");
140 let (_, out) = run_python_bench(
141 "dataloader",
142 "import torch, time; from torch.utils.data import DataLoader, TensorDataset; \
143 ds=TensorDataset(torch.randn(10000,3,224,224),torch.randint(0,1000,(10000,))); \
144 dl=DataLoader(ds,batch_size=64,num_workers=4,pin_memory=True); \
145 t0=time.time(); \
146 for batch in dl: pass; \
147 t1=time.time(); \
148 print(f'{10000/(t1-t0):.0f} samples/s ({(t1-t0):.2f}s for 10K samples)')",
149 )?;
150 results.push(("DataLoader", out.clone()));
151 println!(" {out}");
152 println!();
153
154 println!("[5/5] Training Step Latency");
156 let (_, out) = run_python_bench(
157 "trainstep",
158 "import torch, torch.nn as nn, time; \
159 model=nn.Linear(4096,4096).cuda(); opt=torch.optim.Adam(model.parameters()); \
160 x=torch.randn(256,4096,device='cuda'); \
161 torch.cuda.synchronize(); t0=time.time(); \
162 for _ in range(100): \
163 loss=model(x).sum(); loss.backward(); opt.step(); opt.zero_grad(); \
164 torch.cuda.synchronize(); t1=time.time(); \
165 print(f'{(t1-t0)/100*1000:.1f} ms/step ({(t1-t0):.2f}s for 100 steps)')",
166 )?;
167 results.push(("Train Step", out.clone()));
168 println!(" {out}");
169
170 println!();
172 println!("{}", "=".repeat(60));
173 println!("Summary");
174 println!("{}", "-".repeat(60));
175 for (name, result) in &results {
176 println!(" {:<20} {}", name, result);
177 }
178 println!();
179 }
180
181 BenchCommands::Gpu => {
182 println!("GPU Compute Benchmark");
183 let (_, out) = run_python_bench(
184 "matmul-full",
185 "import torch, time; sizes=[1024,2048,4096,8192]; \
186 for s in sizes: \
187 a=torch.randn(s,s,device='cuda'); b=torch.randn(s,s,device='cuda'); \
188 torch.cuda.synchronize(); t0=time.time(); \
189 [torch.mm(a,b) for _ in range(100)]; \
190 torch.cuda.synchronize(); t1=time.time(); \
191 tflops=2*s**3*100/((t1-t0)*1e12); \
192 print(f' {s}x{s}: {tflops:.1f} TFLOPS ({(t1-t0)/100*1000:.1f} ms/op)')",
193 )?;
194 println!("{out}");
195 }
196
197 BenchCommands::Nccl => {
198 println!("NCCL Benchmark — requires multi-GPU + torchrun");
199 println!("Run: torchrun --nproc_per_node=auto -m torch.distributed.run nccl_bench.py");
200 }
201
202 BenchCommands::Dataloader { path, workers } => {
203 println!("DataLoader Benchmark (path: {path}, workers: {workers})");
204 let (_, out) = run_python_bench(
205 "dl-bench",
206 &format!(
207 "import torch, time, os; from torch.utils.data import DataLoader, TensorDataset; \
208 n=50000; ds=TensorDataset(torch.randn(n,3,224,224),torch.randint(0,1000,(n,))); \
209 for w in [0,1,2,4,{workers}]: \
210 dl=DataLoader(ds,batch_size=64,num_workers=w,pin_memory=True); \
211 t0=time.time(); \
212 for b in dl: pass; \
213 t1=time.time(); \
214 print(f' workers={{w}}: {{n/(t1-t0):.0f}} samples/s')"
215 ),
216 )?;
217 println!("{out}");
218 }
219
220 BenchCommands::Memory => {
221 println!("GPU Memory Allocation Benchmark");
222 let (_, out) = run_python_bench(
223 "mem-bench",
224 "import torch, time; \
225 sizes=[1,10,100,1000]; \
226 for mb in sizes: \
227 n=mb*1024*256; t0=time.time(); \
228 [torch.empty(n,device='cuda') for _ in range(100)]; \
229 torch.cuda.synchronize(); t1=time.time(); \
230 print(f' {mb}MB alloc: {(t1-t0)/100*1e6:.0f} us/alloc')",
231 )?;
232 println!("{out}");
233 }
234
235 BenchCommands::E2e { model, iterations } => {
236 println!("End-to-End Training Benchmark: {model} ({iterations} iterations)");
237 let (_, out) = run_python_bench("e2e", &format!(
238 "import torch, torchvision.models as m, time; \
239 model=getattr(m,'{model}')().cuda(); \
240 opt=torch.optim.SGD(model.parameters(),lr=0.01); \
241 x=torch.randn(32,3,224,224,device='cuda'); t=torch.randint(0,1000,(32,),device='cuda'); \
242 loss_fn=torch.nn.CrossEntropyLoss(); \
243 torch.cuda.synchronize(); t0=time.time(); \
244 for i in range({iterations}): \
245 out=model(x); loss=loss_fn(out,t); loss.backward(); opt.step(); opt.zero_grad(); \
246 torch.cuda.synchronize(); t1=time.time(); \
247 ips=32*{iterations}/(t1-t0); \
248 print(f'{model}: {{ips:.0f}} images/s ({{(t1-t0)/{iterations}*1000:.1f}} ms/step)')"
249 ))?;
250 println!("{out}");
251 }
252
253 BenchCommands::Report => {
254 println!("Run: zernel bench all > benchmark-report.txt");
255 println!("Full HTML report generation coming in a future release.");
256 }
257 }
258 Ok(())
259}