1use anyhow::Result;
14use std::process::Command;
15use tracing::{debug, info};
16
/// Target clock and power settings for a single GPU.
///
/// The values are passed to `nvidia-smi` by `apply_profile`: the clocks via
/// `-ac <memory>,<graphics>` and the power limit via `-pl`. Zero acts as
/// "leave untouched": `apply_profile` only sets the power limit when it is
/// greater than zero, and only sets the clocks when both clocks are greater
/// than zero.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PowerProfile {
    /// Graphics application clock, in the unit `nvidia-smi -ac` expects
    /// (presumably MHz — confirm against the nvidia-smi manual).
    pub graphics_clock: u32,
    /// Memory application clock, in the same unit as `graphics_clock`.
    pub memory_clock: u32,
    /// Board power limit, in the unit `nvidia-smi -pl` expects
    /// (presumably watts — confirm against the nvidia-smi manual).
    pub power_limit: u32,
}
27
28pub fn profile_for_phase(
30 phase: &str,
31 max_graphics: u32,
32 max_memory: u32,
33 max_power: u32,
34) -> PowerProfile {
35 match phase {
36 "DataLoading" => PowerProfile {
37 graphics_clock: max_graphics / 3, memory_clock: max_memory, power_limit: (max_power as f32 * 0.6) as u32,
40 },
41 "GpuCompute" => PowerProfile {
42 graphics_clock: max_graphics, memory_clock: max_memory, power_limit: max_power,
45 },
46 "NcclCollective" => PowerProfile {
47 graphics_clock: max_graphics / 2, memory_clock: max_memory, power_limit: (max_power as f32 * 0.7) as u32,
50 },
51 "OptimizerStep" => PowerProfile {
52 graphics_clock: max_graphics, memory_clock: max_memory,
54 power_limit: max_power,
55 },
56 _ => PowerProfile {
57 graphics_clock: 0,
58 memory_clock: 0,
59 power_limit: 0,
60 },
61 }
62}
63
64pub fn apply_profile(gpu_id: u32, profile: &PowerProfile) -> Result<()> {
66 if profile.power_limit > 0 {
67 let status = Command::new("nvidia-smi")
68 .args([
69 "-i",
70 &gpu_id.to_string(),
71 "-pl",
72 &profile.power_limit.to_string(),
73 ])
74 .output();
75
76 match status {
77 Ok(o) if o.status.success() => {
78 debug!(gpu = gpu_id, power = profile.power_limit, "power limit set");
79 }
80 _ => {
81 debug!(gpu = gpu_id, "power limit change failed (requires root)");
82 }
83 }
84 }
85
86 if profile.graphics_clock > 0 && profile.memory_clock > 0 {
87 let status = Command::new("nvidia-smi")
88 .args([
89 "-i",
90 &gpu_id.to_string(),
91 "-ac",
92 &format!("{},{}", profile.memory_clock, profile.graphics_clock),
93 ])
94 .output();
95
96 match status {
97 Ok(o) if o.status.success() => {
98 debug!(
99 gpu = gpu_id,
100 graphics = profile.graphics_clock,
101 memory = profile.memory_clock,
102 "application clocks set"
103 );
104 }
105 _ => {
106 debug!(gpu = gpu_id, "clock change failed (requires root)");
107 }
108 }
109 }
110
111 Ok(())
112}
113
114pub fn reset_power(gpu_id: u32) -> Result<()> {
116 let _ = Command::new("nvidia-smi")
117 .args(["-i", &gpu_id.to_string(), "-rac"])
118 .output();
119 let _ = Command::new("nvidia-smi")
120 .args(["-i", &gpu_id.to_string(), "-rpl"])
121 .output();
122 info!(gpu = gpu_id, "power state reset to defaults");
123 Ok(())
124}
125
/// Queries `nvidia-smi` for the maximum graphics clock, maximum memory clock
/// and maximum power limit of GPU `gpu_id`.
///
/// Returns `Some((graphics, memory, power))` built from the first three
/// comma-separated fields of the command's output. The power field is parsed
/// as a float and truncated to `u32`.
///
/// Returns `None` when `nvidia-smi` cannot be started, exits with a non-zero
/// status, prints fewer than three fields, or prints a field that does not
/// parse as a number.
pub fn get_max_clocks(gpu_id: u32) -> Option<(u32, u32, u32)> {
    let gpu = gpu_id.to_string();
    let query = Command::new("nvidia-smi")
        .arg("-i")
        .arg(&gpu)
        .arg("--query-gpu=clocks.max.graphics,clocks.max.memory,power.max_limit")
        .arg("--format=csv,noheader,nounits")
        .output()
        .ok()?;

    if !query.status.success() {
        return None;
    }

    let text = String::from_utf8_lossy(&query.stdout);
    // Fields arrive in the order requested by `--query-gpu`.
    let mut columns = text.trim().split(',').map(str::trim);

    let graphics: u32 = columns.next()?.parse().ok()?;
    let memory: u32 = columns.next()?.parse().ok()?;
    let power = columns.next()?.parse::<f32>().ok()? as u32;

    Some((graphics, memory, power))
}
153
/// Collects timestamped power readings and integrates them into energy.
#[derive(Debug, Clone, Default)]
pub struct EnergyTracker {
    /// `(timestamp_secs, power_watts)` pairs in the order they were recorded.
    samples: Vec<(f64, f64)>,
}

impl EnergyTracker {
    /// Creates a tracker with no samples.
    pub fn new() -> Self {
        Self::default()
    }

    /// Appends one power reading.
    ///
    /// `timestamp_secs` is the sample time in seconds and `power_watts` the
    /// power draw at that time. Samples are integrated in insertion order and
    /// are not sorted, so record them in non-decreasing time order.
    pub fn record_sample(&mut self, timestamp_secs: f64, power_watts: f64) {
        self.samples.push((timestamp_secs, power_watts));
    }

    /// Returns the total energy in kilowatt-hours.
    ///
    /// Consecutive samples are integrated with the trapezoidal rule (mean
    /// power of the two samples times the elapsed time). With fewer than two
    /// samples the result is `0.0`. A sample whose timestamp is earlier than
    /// its predecessor contributes a negative amount.
    pub fn total_kwh(&self) -> f64 {
        // `windows(2)` yields nothing for fewer than two samples.
        let energy_wh = self.samples.windows(2).fold(0.0, |acc, pair| {
            let (t_prev, w_prev) = pair[0];
            let (t_next, w_next) = pair[1];
            let dt_hours = (t_next - t_prev) / 3600.0;
            let avg_watts = (w_next + w_prev) / 2.0;
            acc + avg_watts * dt_hours
        });
        energy_wh / 1000.0
    }

    /// Returns `total_kwh()` multiplied by `grid_intensity_kg_per_kwh`, i.e.
    /// the estimated CO2 mass in kilograms.
    pub fn co2_kg(&self, grid_intensity_kg_per_kwh: f64) -> f64 {
        self.total_kwh() * grid_intensity_kg_per_kwh
    }
}
189
#[cfg(test)]
mod tests {
    use super::*;

    /// A constant 300 W draw for one hour should integrate to about 0.3 kWh,
    /// and at 0.42 kg/kWh to about 0.126 kg of CO2.
    #[test]
    fn energy_tracking() {
        let mut meter = EnergyTracker::new();
        meter.record_sample(0.0, 300.0);
        meter.record_sample(3600.0, 300.0);

        let kwh_error = (meter.total_kwh() - 0.3).abs();
        assert!(kwh_error < 0.01);

        let co2_error = (meter.co2_kg(0.42) - 0.126).abs();
        assert!(co2_error < 0.01);
    }

    /// Data loading throttles graphics clock and power but keeps the memory
    /// clock; GPU compute runs at the supplied maxima.
    #[test]
    fn phase_profiles() {
        let loading = profile_for_phase("DataLoading", 2100, 1215, 400);
        assert!(loading.graphics_clock < 2100);
        assert_eq!(loading.memory_clock, 1215);
        assert!(loading.power_limit < 400);

        let compute = profile_for_phase("GpuCompute", 2100, 1215, 400);
        assert_eq!(compute.graphics_clock, 2100);
        assert_eq!(compute.power_limit, 400);
    }
}