1use anyhow::Result;
10use clap::Subcommand;
11use std::process::Command;
12
// Subcommands of `zernel tune`.
// NOTE: plain `//` comments on purpose — `///` doc comments would be picked up
// by clap as CLI help text and change the program's `--help` output.
#[derive(Subcommand)]
pub enum TuneCommands {
    // Detect hardware and print the recommended sysctl parameters.
    Analyze,
    // Apply the recommended parameters via `sysctl -w`.
    Apply {
        // Show what would be applied without touching the system.
        #[arg(long)]
        dry_run: bool,
    },
    // Compare current sysctl values against the recommended ones.
    Diff,
    // Write the recommendations to a sysctl.d-style conf file.
    Export {
        // Output path for the generated conf file.
        #[arg(long, default_value = "zernel-tuned.conf")]
        output: String,
    },
}
32
/// Snapshot of the machine's hardware, as detected by [`HardwareProfile::detect`].
/// All fields fall back to conservative defaults (0 or 1) when probing fails,
/// so a profile is always constructible even on non-Linux hosts.
struct HardwareProfile {
    /// Number of NVIDIA GPUs reported by `nvidia-smi` (0 if unavailable).
    gpu_count: u32,
    /// Memory of the first GPU in MB (per-GPU value, not a total).
    gpu_memory_mb: u64,
    /// Total system RAM in MB (from /proc/meminfo on Linux).
    ram_mb: u64,
    /// Logical CPU count from `available_parallelism`.
    cpu_cores: u32,
    /// NUMA node count from /sys/devices/system/node (1 when unknown).
    numa_nodes: u32,
    /// NVMe controller count from /sys/class/nvme.
    nvme_count: u32,
    /// Assumed network speed — not probed; inferred from InfiniBand presence.
    network_speed_mbps: u32,
    /// Whether /sys/class/infiniband exists.
    has_infiniband: bool,
}
43
impl HardwareProfile {
    /// Probe the local machine and build a best-effort hardware profile.
    ///
    /// Every probe is infallible from the caller's perspective: failures
    /// (missing `nvidia-smi`, non-Linux OS, unreadable sysfs) degrade to a
    /// default value rather than returning an error.
    fn detect() -> Self {
        // `--query-gpu=count` prints the total GPU count once per GPU, so
        // parsing the first line yields the count (or 0 if nvidia-smi is
        // absent or its output is unparsable).
        let gpu_count = Command::new("nvidia-smi")
            .args(["--query-gpu=count", "--format=csv,noheader"])
            .output()
            .ok()
            .and_then(|o| {
                String::from_utf8_lossy(&o.stdout)
                    .trim()
                    .lines()
                    .next()
                    .and_then(|s| s.parse().ok())
            })
            .unwrap_or(0);

        // Memory of the FIRST GPU only (first output line); assumes a
        // homogeneous multi-GPU box — TODO confirm that assumption holds
        // for the target fleet.
        let gpu_memory_mb = Command::new("nvidia-smi")
            .args(["--query-gpu=memory.total", "--format=csv,noheader,nounits"])
            .output()
            .ok()
            .and_then(|o| {
                String::from_utf8_lossy(&o.stdout)
                    .trim()
                    .lines()
                    .next()
                    .and_then(|s| s.parse().ok())
            })
            .unwrap_or(0);

        // Total RAM from the "MemTotal:" line of /proc/meminfo (value is in
        // kB, converted to MB). 0 on non-Linux or parse failure.
        let ram_mb = {
            #[cfg(target_os = "linux")]
            {
                std::fs::read_to_string("/proc/meminfo")
                    .ok()
                    .and_then(|s| {
                        s.lines()
                            .find(|l| l.starts_with("MemTotal:"))
                            .and_then(|l| l.split_whitespace().nth(1))
                            .and_then(|s| s.parse::<u64>().ok())
                            .map(|kb| kb / 1024)
                    })
                    .unwrap_or(0)
            }
            #[cfg(not(target_os = "linux"))]
            {
                0u64
            }
        };

        // Logical CPUs; falls back to 1 if the query fails.
        let cpu_cores = std::thread::available_parallelism()
            .map(|n| n.get() as u32)
            .unwrap_or(1);

        // Count "nodeN" entries under the sysfs NUMA directory; other entries
        // there (online, possible, power, ...) don't start with "node".
        #[cfg(target_os = "linux")]
        let numa_nodes = std::fs::read_dir("/sys/devices/system/node")
            .map(|entries| {
                entries
                    .flatten()
                    .filter(|e| e.file_name().to_string_lossy().starts_with("node"))
                    .count() as u32
            })
            .unwrap_or(1);
        #[cfg(not(target_os = "linux"))]
        let numa_nodes = 1u32;

        // Each entry in /sys/class/nvme is one NVMe controller.
        #[cfg(target_os = "linux")]
        let nvme_count = std::fs::read_dir("/sys/class/nvme")
            .map(|entries| entries.flatten().count() as u32)
            .unwrap_or(0);
        #[cfg(not(target_os = "linux"))]
        let nvme_count = 0u32;

        let has_infiniband = std::path::Path::new("/sys/class/infiniband").exists();

        // Network speed is NOT measured — it is a heuristic: assume 100 Gbps
        // when InfiniBand hardware is present, 25 Gbps Ethernet otherwise.
        let network_speed_mbps = if has_infiniband { 100000 } else { 25000 };

        Self {
            gpu_count,
            gpu_memory_mb,
            ram_mb,
            cpu_cores,
            numa_nodes,
            nvme_count,
            network_speed_mbps,
            has_infiniband,
        }
    }
}
131
/// One recommended sysctl setting plus the human-readable justification
/// printed by `analyze`/`diff` and embedded as a comment by `export`.
struct TuningParam {
    /// Full sysctl key, e.g. "vm.swappiness".
    key: String,
    /// Value to set, already formatted as sysctl expects it.
    value: String,
    /// Why this value was chosen (may reference the detected hardware).
    reason: String,
}
137
138fn generate_params(hw: &HardwareProfile) -> Vec<TuningParam> {
139 let mut params = Vec::new();
140
141 params.push(TuningParam {
143 key: "vm.swappiness".into(),
144 value: "0".into(),
145 reason: "ML servers should never swap — kills GPU performance".into(),
146 });
147
148 params.push(TuningParam {
149 key: "vm.overcommit_memory".into(),
150 value: "1".into(),
151 reason: "PyTorch requires memory overcommit for large allocations".into(),
152 });
153
154 let hugepages = if hw.gpu_memory_mb > 40000 {
156 2048 } else if hw.gpu_memory_mb > 16000 {
158 1024 } else {
160 512 };
162 params.push(TuningParam {
163 key: "vm.nr_hugepages".into(),
164 value: hugepages.to_string(),
165 reason: format!(
166 "Pre-allocate {}MB huge pages for GPU DMA (based on {}MB GPU memory)",
167 hugepages * 2,
168 hw.gpu_memory_mb
169 ),
170 });
171
172 let dirty_ratio = if hw.ram_mb > 256000 {
174 60
175 } else if hw.ram_mb > 128000 {
176 40
177 } else {
178 20
179 };
180 params.push(TuningParam {
181 key: "vm.dirty_ratio".into(),
182 value: dirty_ratio.to_string(),
183 reason: format!(
184 "Allow {}% dirty pages for large dataset writes ({}GB RAM)",
185 dirty_ratio,
186 hw.ram_mb / 1024
187 ),
188 });
189
190 params.push(TuningParam {
191 key: "vm.dirty_background_ratio".into(),
192 value: "10".into(),
193 reason: "Start background writeback at 10%".into(),
194 });
195
196 let net_buf = if hw.has_infiniband || hw.network_speed_mbps >= 100000 {
198 268435456 } else if hw.network_speed_mbps >= 25000 {
200 134217728 } else {
202 67108864 };
204
205 params.push(TuningParam {
206 key: "net.core.rmem_max".into(),
207 value: net_buf.to_string(),
208 reason: format!(
209 "{}MB receive buffer for {}Gbps network (NCCL distributed training)",
210 net_buf / 1048576,
211 hw.network_speed_mbps / 1000
212 ),
213 });
214
215 params.push(TuningParam {
216 key: "net.core.wmem_max".into(),
217 value: net_buf.to_string(),
218 reason: format!("{}MB send buffer", net_buf / 1048576),
219 });
220
221 params.push(TuningParam {
222 key: "net.ipv4.tcp_rmem".into(),
223 value: format!("4096 87380 {net_buf}"),
224 reason: "TCP receive buffer auto-tuning range".into(),
225 });
226
227 params.push(TuningParam {
228 key: "net.ipv4.tcp_wmem".into(),
229 value: format!("4096 65536 {net_buf}"),
230 reason: "TCP send buffer auto-tuning range".into(),
231 });
232
233 params.push(TuningParam {
234 key: "net.ipv4.tcp_congestion_control".into(),
235 value: "bbr".into(),
236 reason: "BBR congestion control — better for datacenter workloads".into(),
237 });
238
239 let backlog = if hw.gpu_count > 4 { 500000 } else { 250000 };
240 params.push(TuningParam {
241 key: "net.core.netdev_max_backlog".into(),
242 value: backlog.to_string(),
243 reason: format!(
244 "Network backlog for {} GPUs (NCCL generates bursty traffic)",
245 hw.gpu_count
246 ),
247 });
248
249 if hw.numa_nodes > 1 {
251 params.push(TuningParam {
252 key: "kernel.numa_balancing".into(),
253 value: "1".into(),
254 reason: format!("NUMA auto-balancing for {} NUMA nodes", hw.numa_nodes),
255 });
256 }
257
258 let file_max = if hw.gpu_count > 4 { 4194304 } else { 2097152 };
260 params.push(TuningParam {
261 key: "fs.file-max".into(),
262 value: file_max.to_string(),
263 reason: format!(
264 "{} max file handles (DataLoader workers + {} GPUs)",
265 file_max, hw.gpu_count
266 ),
267 });
268
269 params.push(TuningParam {
270 key: "fs.inotify.max_user_watches".into(),
271 value: "1048576".into(),
272 reason: "High inotify watches for dataset monitoring".into(),
273 });
274
275 if hw.cpu_cores > 16 {
277 params.push(TuningParam {
278 key: "kernel.sched_migration_cost_ns".into(),
279 value: "5000000".into(),
280 reason: format!(
281 "Reduce scheduler migration cost on {} cores (keep DataLoader threads on-core)",
282 hw.cpu_cores
283 ),
284 });
285 }
286
287 params
288}
289
290pub async fn run(cmd: TuneCommands) -> Result<()> {
291 match cmd {
292 TuneCommands::Analyze => {
293 let hw = HardwareProfile::detect();
294
295 println!("Zernel Hardware Analysis");
296 println!("{}", "=".repeat(60));
297 println!();
298 println!("Detected Hardware:");
299 println!(
300 " GPUs: {} ({}MB each)",
301 hw.gpu_count, hw.gpu_memory_mb
302 );
303 println!(" RAM: {} GB", hw.ram_mb / 1024);
304 println!(" CPU cores: {}", hw.cpu_cores);
305 println!(" NUMA nodes: {}", hw.numa_nodes);
306 println!(" NVMe drives: {}", hw.nvme_count);
307 println!(
308 " Network: {}Gbps {}",
309 hw.network_speed_mbps / 1000,
310 if hw.has_infiniband {
311 "(InfiniBand)"
312 } else {
313 ""
314 }
315 );
316 println!();
317
318 let params = generate_params(&hw);
319 println!("Recommended Parameters ({} total):", params.len());
320 println!("{}", "-".repeat(60));
321 for p in ¶ms {
322 println!(" {} = {}", p.key, p.value);
323 println!(" # {}", p.reason);
324 }
325
326 println!();
327 println!("Apply: zernel tune apply");
328 println!("Export: zernel tune export --output zernel-tuned.conf");
329 }
330
331 TuneCommands::Apply { dry_run } => {
332 let hw = HardwareProfile::detect();
333 let params = generate_params(&hw);
334
335 if dry_run {
336 println!("Dry run — showing what would be applied:");
337 } else {
338 println!("Applying {} tuning parameters...", params.len());
339 }
340
341 for p in ¶ms {
342 if dry_run {
343 println!(" sysctl -w {} = {}", p.key, p.value);
344 } else {
345 let status = Command::new("sysctl")
346 .args(["-w", &format!("{}={}", p.key, p.value)])
347 .output();
348 match status {
349 Ok(o) if o.status.success() => {
350 println!(" OK: {} = {}", p.key, p.value);
351 }
352 _ => {
353 println!(" SKIP: {} (requires root)", p.key);
354 }
355 }
356 }
357 }
358
359 if !dry_run {
360 println!();
361 println!("Parameters applied. To persist across reboots:");
362 println!(" zernel tune export --output /etc/sysctl.d/99-zernel-tuned.conf");
363 }
364 }
365
366 TuneCommands::Diff => {
367 let hw = HardwareProfile::detect();
368 let params = generate_params(&hw);
369
370 println!("Current vs Optimal Parameters");
371 println!("{}", "=".repeat(70));
372 println!("{:<40} {:>12} {:>12}", "Parameter", "Current", "Optimal");
373 println!("{}", "-".repeat(70));
374
375 for p in ¶ms {
376 let current = Command::new("sysctl")
377 .args(["-n", &p.key])
378 .output()
379 .ok()
380 .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
381 .unwrap_or_else(|| "N/A".into());
382
383 let marker = if current.trim() == p.value.trim() {
384 " "
385 } else {
386 "→ "
387 };
388 println!("{}{:<38} {:>12} {:>12}", marker, p.key, current, p.value);
389 }
390 }
391
392 TuneCommands::Export { output } => {
393 let hw = HardwareProfile::detect();
394 let params = generate_params(&hw);
395
396 let mut conf = String::new();
397 conf.push_str("# Zernel Auto-Tuned Parameters\n");
398 conf.push_str(&format!(
399 "# Generated for: {} GPUs, {}GB RAM, {} cores, {} NUMA nodes\n",
400 hw.gpu_count,
401 hw.ram_mb / 1024,
402 hw.cpu_cores,
403 hw.numa_nodes
404 ));
405 conf.push_str(&format!(
406 "# Generated at: {}\n\n",
407 chrono::Utc::now().to_rfc3339()
408 ));
409
410 for p in ¶ms {
411 conf.push_str(&format!("# {}\n", p.reason));
412 conf.push_str(&format!("{} = {}\n\n", p.key, p.value));
413 }
414
415 std::fs::write(&output, &conf)?;
416 println!("Exported {} parameters to: {output}", params.len());
417 println!("Apply: sudo cp {output} /etc/sysctl.d/ && sudo sysctl --system");
418 }
419 }
420 Ok(())
421}