// zernel/commands/tune.rs

// Copyright (C) 2026 Dyber, Inc. — Proprietary

//! zernel tune — Adaptive kernel parameter tuning
//!
//! Reads hardware configuration (GPU count, RAM, NVMe, network) and
//! generates optimal sysctl + kernel parameters for the specific machine.
//! Not a static config — adapts to actual hardware present.

use std::process::Command;
use std::str::FromStr;

use anyhow::Result;
use clap::Subcommand;
12
// CLI surface for `zernel tune`. Note: the `///` doc comments below double
// as clap-generated `--help` text, so they are user-visible strings —
// edit them as output, not as internal documentation.
#[derive(Subcommand)]
pub enum TuneCommands {
    /// Analyze hardware and show recommended parameters
    Analyze,
    /// Apply optimal parameters (requires root)
    Apply {
        /// Dry run — show what would be changed without applying
        #[arg(long)]
        dry_run: bool,
    },
    /// Show current vs optimal parameters
    Diff,
    /// Generate a sysctl.conf file for this machine
    Export {
        /// Output file
        #[arg(long, default_value = "zernel-tuned.conf")]
        output: String,
    },
}
32
// Snapshot of the machine's hardware, as detected by `HardwareProfile::detect`.
// All fields degrade to 0/1/false when a probe fails, so consumers never
// need to handle "unknown".
struct HardwareProfile {
    gpu_count: u32,          // NVIDIA GPUs reported by nvidia-smi (0 if unavailable)
    gpu_memory_mb: u64,      // memory of the first GPU, in MB (0 if unavailable)
    ram_mb: u64,             // total system RAM in MB (from /proc/meminfo on Linux)
    cpu_cores: u32,          // logical cores (available_parallelism)
    numa_nodes: u32,         // NUMA node count (1 on non-Linux or single-socket)
    nvme_count: u32,         // NVMe controllers under /sys/class/nvme
    network_speed_mbps: u32, // heuristic: 100000 with InfiniBand, else 25000
    has_infiniband: bool,    // /sys/class/infiniband exists
}
43
44impl HardwareProfile {
45    fn detect() -> Self {
46        let gpu_count = Command::new("nvidia-smi")
47            .args(["--query-gpu=count", "--format=csv,noheader"])
48            .output()
49            .ok()
50            .and_then(|o| {
51                String::from_utf8_lossy(&o.stdout)
52                    .trim()
53                    .lines()
54                    .next()
55                    .and_then(|s| s.parse().ok())
56            })
57            .unwrap_or(0);
58
59        let gpu_memory_mb = Command::new("nvidia-smi")
60            .args(["--query-gpu=memory.total", "--format=csv,noheader,nounits"])
61            .output()
62            .ok()
63            .and_then(|o| {
64                String::from_utf8_lossy(&o.stdout)
65                    .trim()
66                    .lines()
67                    .next()
68                    .and_then(|s| s.parse().ok())
69            })
70            .unwrap_or(0);
71
72        let ram_mb = {
73            #[cfg(target_os = "linux")]
74            {
75                std::fs::read_to_string("/proc/meminfo")
76                    .ok()
77                    .and_then(|s| {
78                        s.lines()
79                            .find(|l| l.starts_with("MemTotal:"))
80                            .and_then(|l| l.split_whitespace().nth(1))
81                            .and_then(|s| s.parse::<u64>().ok())
82                            .map(|kb| kb / 1024)
83                    })
84                    .unwrap_or(0)
85            }
86            #[cfg(not(target_os = "linux"))]
87            {
88                0u64
89            }
90        };
91
92        let cpu_cores = std::thread::available_parallelism()
93            .map(|n| n.get() as u32)
94            .unwrap_or(1);
95
96        #[cfg(target_os = "linux")]
97        let numa_nodes = std::fs::read_dir("/sys/devices/system/node")
98            .map(|entries| {
99                entries
100                    .flatten()
101                    .filter(|e| e.file_name().to_string_lossy().starts_with("node"))
102                    .count() as u32
103            })
104            .unwrap_or(1);
105        #[cfg(not(target_os = "linux"))]
106        let numa_nodes = 1u32;
107
108        #[cfg(target_os = "linux")]
109        let nvme_count = std::fs::read_dir("/sys/class/nvme")
110            .map(|entries| entries.flatten().count() as u32)
111            .unwrap_or(0);
112        #[cfg(not(target_os = "linux"))]
113        let nvme_count = 0u32;
114
115        let has_infiniband = std::path::Path::new("/sys/class/infiniband").exists();
116
117        let network_speed_mbps = if has_infiniband { 100000 } else { 25000 };
118
119        Self {
120            gpu_count,
121            gpu_memory_mb,
122            ram_mb,
123            cpu_cores,
124            numa_nodes,
125            nvme_count,
126            network_speed_mbps,
127            has_infiniband,
128        }
129    }
130}
131
// A single sysctl recommendation produced by `generate_params`.
struct TuningParam {
    key: String,    // sysctl key, e.g. "vm.swappiness"
    value: String,  // value as passed to `sysctl -w` / written to a conf file
    reason: String, // one-line rationale (shown in analyze, exported as a comment)
}
137
138fn generate_params(hw: &HardwareProfile) -> Vec<TuningParam> {
139    let mut params = Vec::new();
140
141    // === Memory ===
142    params.push(TuningParam {
143        key: "vm.swappiness".into(),
144        value: "0".into(),
145        reason: "ML servers should never swap — kills GPU performance".into(),
146    });
147
148    params.push(TuningParam {
149        key: "vm.overcommit_memory".into(),
150        value: "1".into(),
151        reason: "PyTorch requires memory overcommit for large allocations".into(),
152    });
153
154    // Huge pages — scale with GPU memory
155    let hugepages = if hw.gpu_memory_mb > 40000 {
156        2048 // 2GB for A100/H100
157    } else if hw.gpu_memory_mb > 16000 {
158        1024 // 1GB for V100/RTX 3090
159    } else {
160        512 // 512MB for smaller GPUs
161    };
162    params.push(TuningParam {
163        key: "vm.nr_hugepages".into(),
164        value: hugepages.to_string(),
165        reason: format!(
166            "Pre-allocate {}MB huge pages for GPU DMA (based on {}MB GPU memory)",
167            hugepages * 2,
168            hw.gpu_memory_mb
169        ),
170    });
171
172    // Dirty ratio — scale with RAM
173    let dirty_ratio = if hw.ram_mb > 256000 {
174        60
175    } else if hw.ram_mb > 128000 {
176        40
177    } else {
178        20
179    };
180    params.push(TuningParam {
181        key: "vm.dirty_ratio".into(),
182        value: dirty_ratio.to_string(),
183        reason: format!(
184            "Allow {}% dirty pages for large dataset writes ({}GB RAM)",
185            dirty_ratio,
186            hw.ram_mb / 1024
187        ),
188    });
189
190    params.push(TuningParam {
191        key: "vm.dirty_background_ratio".into(),
192        value: "10".into(),
193        reason: "Start background writeback at 10%".into(),
194    });
195
196    // === Network ===
197    let net_buf = if hw.has_infiniband || hw.network_speed_mbps >= 100000 {
198        268435456 // 256MB for InfiniBand/100GbE
199    } else if hw.network_speed_mbps >= 25000 {
200        134217728 // 128MB for 25GbE
201    } else {
202        67108864 // 64MB for 10GbE
203    };
204
205    params.push(TuningParam {
206        key: "net.core.rmem_max".into(),
207        value: net_buf.to_string(),
208        reason: format!(
209            "{}MB receive buffer for {}Gbps network (NCCL distributed training)",
210            net_buf / 1048576,
211            hw.network_speed_mbps / 1000
212        ),
213    });
214
215    params.push(TuningParam {
216        key: "net.core.wmem_max".into(),
217        value: net_buf.to_string(),
218        reason: format!("{}MB send buffer", net_buf / 1048576),
219    });
220
221    params.push(TuningParam {
222        key: "net.ipv4.tcp_rmem".into(),
223        value: format!("4096 87380 {net_buf}"),
224        reason: "TCP receive buffer auto-tuning range".into(),
225    });
226
227    params.push(TuningParam {
228        key: "net.ipv4.tcp_wmem".into(),
229        value: format!("4096 65536 {net_buf}"),
230        reason: "TCP send buffer auto-tuning range".into(),
231    });
232
233    params.push(TuningParam {
234        key: "net.ipv4.tcp_congestion_control".into(),
235        value: "bbr".into(),
236        reason: "BBR congestion control — better for datacenter workloads".into(),
237    });
238
239    let backlog = if hw.gpu_count > 4 { 500000 } else { 250000 };
240    params.push(TuningParam {
241        key: "net.core.netdev_max_backlog".into(),
242        value: backlog.to_string(),
243        reason: format!(
244            "Network backlog for {} GPUs (NCCL generates bursty traffic)",
245            hw.gpu_count
246        ),
247    });
248
249    // === NUMA ===
250    if hw.numa_nodes > 1 {
251        params.push(TuningParam {
252            key: "kernel.numa_balancing".into(),
253            value: "1".into(),
254            reason: format!("NUMA auto-balancing for {} NUMA nodes", hw.numa_nodes),
255        });
256    }
257
258    // === File handles ===
259    let file_max = if hw.gpu_count > 4 { 4194304 } else { 2097152 };
260    params.push(TuningParam {
261        key: "fs.file-max".into(),
262        value: file_max.to_string(),
263        reason: format!(
264            "{} max file handles (DataLoader workers + {} GPUs)",
265            file_max, hw.gpu_count
266        ),
267    });
268
269    params.push(TuningParam {
270        key: "fs.inotify.max_user_watches".into(),
271        value: "1048576".into(),
272        reason: "High inotify watches for dataset monitoring".into(),
273    });
274
275    // === Scheduler ===
276    if hw.cpu_cores > 16 {
277        params.push(TuningParam {
278            key: "kernel.sched_migration_cost_ns".into(),
279            value: "5000000".into(),
280            reason: format!(
281                "Reduce scheduler migration cost on {} cores (keep DataLoader threads on-core)",
282                hw.cpu_cores
283            ),
284        });
285    }
286
287    params
288}
289
290pub async fn run(cmd: TuneCommands) -> Result<()> {
291    match cmd {
292        TuneCommands::Analyze => {
293            let hw = HardwareProfile::detect();
294
295            println!("Zernel Hardware Analysis");
296            println!("{}", "=".repeat(60));
297            println!();
298            println!("Detected Hardware:");
299            println!(
300                "  GPUs:         {} ({}MB each)",
301                hw.gpu_count, hw.gpu_memory_mb
302            );
303            println!("  RAM:          {} GB", hw.ram_mb / 1024);
304            println!("  CPU cores:    {}", hw.cpu_cores);
305            println!("  NUMA nodes:   {}", hw.numa_nodes);
306            println!("  NVMe drives:  {}", hw.nvme_count);
307            println!(
308                "  Network:      {}Gbps {}",
309                hw.network_speed_mbps / 1000,
310                if hw.has_infiniband {
311                    "(InfiniBand)"
312                } else {
313                    ""
314                }
315            );
316            println!();
317
318            let params = generate_params(&hw);
319            println!("Recommended Parameters ({} total):", params.len());
320            println!("{}", "-".repeat(60));
321            for p in &params {
322                println!("  {} = {}", p.key, p.value);
323                println!("    # {}", p.reason);
324            }
325
326            println!();
327            println!("Apply: zernel tune apply");
328            println!("Export: zernel tune export --output zernel-tuned.conf");
329        }
330
331        TuneCommands::Apply { dry_run } => {
332            let hw = HardwareProfile::detect();
333            let params = generate_params(&hw);
334
335            if dry_run {
336                println!("Dry run — showing what would be applied:");
337            } else {
338                println!("Applying {} tuning parameters...", params.len());
339            }
340
341            for p in &params {
342                if dry_run {
343                    println!("  sysctl -w {} = {}", p.key, p.value);
344                } else {
345                    let status = Command::new("sysctl")
346                        .args(["-w", &format!("{}={}", p.key, p.value)])
347                        .output();
348                    match status {
349                        Ok(o) if o.status.success() => {
350                            println!("  OK: {} = {}", p.key, p.value);
351                        }
352                        _ => {
353                            println!("  SKIP: {} (requires root)", p.key);
354                        }
355                    }
356                }
357            }
358
359            if !dry_run {
360                println!();
361                println!("Parameters applied. To persist across reboots:");
362                println!("  zernel tune export --output /etc/sysctl.d/99-zernel-tuned.conf");
363            }
364        }
365
366        TuneCommands::Diff => {
367            let hw = HardwareProfile::detect();
368            let params = generate_params(&hw);
369
370            println!("Current vs Optimal Parameters");
371            println!("{}", "=".repeat(70));
372            println!("{:<40} {:>12} {:>12}", "Parameter", "Current", "Optimal");
373            println!("{}", "-".repeat(70));
374
375            for p in &params {
376                let current = Command::new("sysctl")
377                    .args(["-n", &p.key])
378                    .output()
379                    .ok()
380                    .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
381                    .unwrap_or_else(|| "N/A".into());
382
383                let marker = if current.trim() == p.value.trim() {
384                    "  "
385                } else {
386                    "→ "
387                };
388                println!("{}{:<38} {:>12} {:>12}", marker, p.key, current, p.value);
389            }
390        }
391
392        TuneCommands::Export { output } => {
393            let hw = HardwareProfile::detect();
394            let params = generate_params(&hw);
395
396            let mut conf = String::new();
397            conf.push_str("# Zernel Auto-Tuned Parameters\n");
398            conf.push_str(&format!(
399                "# Generated for: {} GPUs, {}GB RAM, {} cores, {} NUMA nodes\n",
400                hw.gpu_count,
401                hw.ram_mb / 1024,
402                hw.cpu_cores,
403                hw.numa_nodes
404            ));
405            conf.push_str(&format!(
406                "# Generated at: {}\n\n",
407                chrono::Utc::now().to_rfc3339()
408            ));
409
410            for p in &params {
411                conf.push_str(&format!("# {}\n", p.reason));
412                conf.push_str(&format!("{} = {}\n\n", p.key, p.value));
413            }
414
415            std::fs::write(&output, &conf)?;
416            println!("Exported {} parameters to: {output}", params.len());
417            println!("Apply: sudo cp {output} /etc/sysctl.d/ && sudo sysctl --system");
418        }
419    }
420    Ok(())
421}