zernel_ebpf/
power.rs

1// Copyright (C) 2026 Dyber, Inc. — GPL-2.0
2
3//! Smart GPU Power Management
4//!
5//! Dynamically adjusts GPU power states based on ML workload phase:
6//! - GpuCompute: full power (max clocks)
7//! - DataLoading: reduce GPU clocks (GPU mostly idle, save power)
8//! - NcclCollective: reduce compute clocks, keep memory clocks high
9//! - OptimizerStep: brief burst, keep full power
10//!
11//! Can reduce energy consumption by 10-20% with <1% throughput impact.
12
13use anyhow::Result;
14use std::process::Command;
15use tracing::{debug, info};
16
/// GPU power profile for a single ML workload phase.
///
/// Every field uses `0` as a sentinel meaning "leave this setting unchanged",
/// so `PowerProfile::default()` (all zeros) is a profile that changes nothing.
/// Note that `apply_profile` only sets application clocks when *both* clock
/// fields are non-zero.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub struct PowerProfile {
    /// Graphics clock (MHz) — 0 means "don't change"
    pub graphics_clock: u32,
    /// Memory clock (MHz) — 0 means "don't change"
    pub memory_clock: u32,
    /// Power limit (Watts) — 0 means "don't change"
    pub power_limit: u32,
}
27
28/// Default power profiles for each phase.
29pub fn profile_for_phase(
30    phase: &str,
31    max_graphics: u32,
32    max_memory: u32,
33    max_power: u32,
34) -> PowerProfile {
35    match phase {
36        "DataLoading" => PowerProfile {
37            graphics_clock: max_graphics / 3, // GPU mostly idle
38            memory_clock: max_memory,         // keep memory fast for H2D
39            power_limit: (max_power as f32 * 0.6) as u32,
40        },
41        "GpuCompute" => PowerProfile {
42            graphics_clock: max_graphics, // full compute
43            memory_clock: max_memory,     // full memory
44            power_limit: max_power,
45        },
46        "NcclCollective" => PowerProfile {
47            graphics_clock: max_graphics / 2, // less compute needed
48            memory_clock: max_memory,         // memory for transfers
49            power_limit: (max_power as f32 * 0.7) as u32,
50        },
51        "OptimizerStep" => PowerProfile {
52            graphics_clock: max_graphics, // brief CPU+GPU burst
53            memory_clock: max_memory,
54            power_limit: max_power,
55        },
56        _ => PowerProfile {
57            graphics_clock: 0,
58            memory_clock: 0,
59            power_limit: 0,
60        },
61    }
62}
63
64/// Apply a power profile to a specific GPU.
65pub fn apply_profile(gpu_id: u32, profile: &PowerProfile) -> Result<()> {
66    if profile.power_limit > 0 {
67        let status = Command::new("nvidia-smi")
68            .args([
69                "-i",
70                &gpu_id.to_string(),
71                "-pl",
72                &profile.power_limit.to_string(),
73            ])
74            .output();
75
76        match status {
77            Ok(o) if o.status.success() => {
78                debug!(gpu = gpu_id, power = profile.power_limit, "power limit set");
79            }
80            _ => {
81                debug!(gpu = gpu_id, "power limit change failed (requires root)");
82            }
83        }
84    }
85
86    if profile.graphics_clock > 0 && profile.memory_clock > 0 {
87        let status = Command::new("nvidia-smi")
88            .args([
89                "-i",
90                &gpu_id.to_string(),
91                "-ac",
92                &format!("{},{}", profile.memory_clock, profile.graphics_clock),
93            ])
94            .output();
95
96        match status {
97            Ok(o) if o.status.success() => {
98                debug!(
99                    gpu = gpu_id,
100                    graphics = profile.graphics_clock,
101                    memory = profile.memory_clock,
102                    "application clocks set"
103                );
104            }
105            _ => {
106                debug!(gpu = gpu_id, "clock change failed (requires root)");
107            }
108        }
109    }
110
111    Ok(())
112}
113
114/// Reset GPU to default power state.
115pub fn reset_power(gpu_id: u32) -> Result<()> {
116    let _ = Command::new("nvidia-smi")
117        .args(["-i", &gpu_id.to_string(), "-rac"])
118        .output();
119    let _ = Command::new("nvidia-smi")
120        .args(["-i", &gpu_id.to_string(), "-rpl"])
121        .output();
122    info!(gpu = gpu_id, "power state reset to defaults");
123    Ok(())
124}
125
/// Query a GPU's maximum clocks and power limit through `nvidia-smi`.
///
/// Returns `Some((graphics_mhz, memory_mhz, power_watts))` parsed from the
/// `clocks.max.graphics`, `clocks.max.memory` and `power.max_limit` columns.
/// The power value is parsed as a float and converted to whole watts.
///
/// Returns `None` when `nvidia-smi` cannot be run, exits unsuccessfully, or
/// its output does not contain three parseable comma-separated values.
pub fn get_max_clocks(gpu_id: u32) -> Option<(u32, u32, u32)> {
    let gpu = gpu_id.to_string();
    let result = Command::new("nvidia-smi")
        .args([
            "-i",
            gpu.as_str(),
            "--query-gpu=clocks.max.graphics,clocks.max.memory,power.max_limit",
            "--format=csv,noheader,nounits",
        ])
        .output()
        .ok()
        .filter(|out| out.status.success())?;

    let text = String::from_utf8_lossy(&result.stdout);
    let mut columns = text.trim().split(',').map(str::trim);

    // Columns arrive in query order: graphics, memory, power.
    let graphics: u32 = columns.next()?.parse().ok()?;
    let memory: u32 = columns.next()?.parse().ok()?;
    let power = columns.next()?.parse::<f32>().ok()? as u32;

    Some((graphics, memory, power))
}
153
/// Track energy consumption over time from timestamped power readings.
#[derive(Debug, Clone, Default)]
pub struct EnergyTracker {
    /// Recorded `(timestamp_secs, power_watts)` pairs, in insertion order.
    samples: Vec<(f64, f64)>,
}

impl EnergyTracker {
    /// Create a tracker with no samples (same as `EnergyTracker::default()`).
    pub fn new() -> Self {
        Self::default()
    }

    /// Append a power reading of `power_watts` taken at `timestamp_secs`.
    pub fn record_sample(&mut self, timestamp_secs: f64, power_watts: f64) {
        self.samples.push((timestamp_secs, power_watts));
    }

    /// Total energy in kWh.
    ///
    /// Each pair of consecutive samples contributes the average of their two
    /// power readings multiplied by the time between them. With fewer than two
    /// samples the result is `0.0`. Samples are integrated in the order they
    /// were recorded, so a timestamp earlier than its predecessor contributes
    /// negative energy.
    pub fn total_kwh(&self) -> f64 {
        let energy_wh = self.samples.windows(2).fold(0.0, |acc, pair| {
            let (t_prev, w_prev) = pair[0];
            let (t_next, w_next) = pair[1];
            let dt_hours = (t_next - t_prev) / 3600.0;
            let avg_watts = (w_next + w_prev) / 2.0;
            acc + avg_watts * dt_hours
        });
        energy_wh / 1000.0
    }

    /// Estimated CO2 emissions in kg.
    ///
    /// `grid_intensity_kg_per_kwh` is the carbon intensity of the electricity
    /// supply (e.g. 0.42 kg CO2/kWh for the US average grid); the result is
    /// `total_kwh()` multiplied by it.
    pub fn co2_kg(&self, grid_intensity_kg_per_kwh: f64) -> f64 {
        self.total_kwh() * grid_intensity_kg_per_kwh
    }
}
189
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn energy_tracking() {
        let mut tracker = EnergyTracker::new();
        // 300W for 1 hour = 0.3 kWh
        tracker.record_sample(0.0, 300.0);
        tracker.record_sample(3600.0, 300.0);
        assert!((tracker.total_kwh() - 0.3).abs() < 0.01);
        // US grid: 0.42 kg/kWh
        assert!((tracker.co2_kg(0.42) - 0.126).abs() < 0.01);
    }

    #[test]
    fn energy_tracking_needs_two_samples() {
        let mut tracker = EnergyTracker::new();
        assert_eq!(tracker.total_kwh(), 0.0);

        // A single reading spans no time, so no energy can be attributed yet.
        tracker.record_sample(0.0, 300.0);
        assert_eq!(tracker.total_kwh(), 0.0);
        assert_eq!(tracker.co2_kg(0.42), 0.0);
    }

    #[test]
    fn energy_tracking_averages_between_samples() {
        let mut tracker = EnergyTracker::new();
        // Ramp 0W -> 200W over half an hour: avg 100W * 0.5h = 0.05 kWh
        tracker.record_sample(0.0, 0.0);
        tracker.record_sample(1800.0, 200.0);
        assert!((tracker.total_kwh() - 0.05).abs() < 1e-9);
    }

    #[test]
    fn phase_profiles() {
        let p = profile_for_phase("DataLoading", 2100, 1215, 400);
        assert!(p.graphics_clock < 2100); // reduced
        assert_eq!(p.memory_clock, 1215); // kept high
        assert!(p.power_limit < 400); // reduced

        let p = profile_for_phase("GpuCompute", 2100, 1215, 400);
        assert_eq!(p.graphics_clock, 2100); // full
        assert_eq!(p.power_limit, 400); // full
    }

    #[test]
    fn collective_and_optimizer_profiles() {
        let p = profile_for_phase("NcclCollective", 2100, 1215, 400);
        assert_eq!(p.graphics_clock, 1050); // halved
        assert_eq!(p.memory_clock, 1215); // kept high
        assert!(p.power_limit > 0 && p.power_limit < 400); // reduced, not disabled

        let p = profile_for_phase("OptimizerStep", 2100, 1215, 400);
        assert_eq!(p.graphics_clock, 2100); // full
        assert_eq!(p.memory_clock, 1215); // full
        assert_eq!(p.power_limit, 400); // full
    }

    #[test]
    fn unknown_phase_changes_nothing() {
        let p = profile_for_phase("NotAPhase", 2100, 1215, 400);
        assert_eq!(p.graphics_clock, 0);
        assert_eq!(p.memory_clock, 0);
        assert_eq!(p.power_limit, 0);
    }
}