1use anyhow::Result;
12use std::path::PathBuf;
13use tracing::info;
14#[cfg(feature = "bpf")]
15use tracing::warn;
16
/// Records, per probe, whether that probe is currently marked active.
///
/// `Default` yields the same value as [`ProbeStatus::none`]: every flag
/// `false`. `PartialEq`/`Eq` let two statuses be compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ProbeStatus {
    /// Whether the `gpu_mem` probe is active.
    pub gpu_mem: bool,
    /// Whether the `cuda_trace` probe is active.
    pub cuda_trace: bool,
    /// Whether the `nccl` probe is active.
    pub nccl: bool,
    /// Whether the `dataload` probe is active.
    pub dataload: bool,
    /// Whether the `dist_sync` probe is active.
    pub dist_sync: bool,
}

impl ProbeStatus {
    /// Returns a status with every probe marked inactive.
    ///
    /// Spelled out field by field (rather than via `Default`) so it can be a
    /// `const fn`.
    pub const fn none() -> Self {
        Self {
            gpu_mem: false,
            cuda_trace: false,
            nccl: false,
            dataload: false,
            dist_sync: false,
        }
    }

    /// Returns the number of probes marked active, from 0 to 5.
    pub fn active_count(&self) -> usize {
        [
            self.gpu_mem,
            self.cuda_trace,
            self.nccl,
            self.dataload,
            self.dist_sync,
        ]
        .iter()
        .map(|&flag| usize::from(flag))
        .sum()
    }
}
51
52pub struct LoadResult {
54 pub status: ProbeStatus,
55 }
60
/// Reports whether `file_name` is the shared object for the library stem
/// `name`.
///
/// Accepts the unversioned form (`<name>.so`) and versioned forms
/// (`<name>.so.<version>`). Anything else following the stem is rejected, so
/// a different library that merely shares the prefix — `libcudart.so` or
/// `libcudadebugger.so.1` when looking for `libcuda` — does not match.
// Only reachable from the BPF loader, so it is legitimately unused otherwise.
#[cfg_attr(not(feature = "bpf"), allow(dead_code))]
fn is_shared_object_for(file_name: &str, name: &str) -> bool {
    match file_name
        .strip_prefix(name)
        .and_then(|rest| rest.strip_prefix(".so"))
    {
        Some(version) => version.is_empty() || version.starts_with('.'),
        None => false,
    }
}

/// Locates a shared library by stem in a fixed list of system directories.
///
/// `name` is the library name without the `.so` suffix (e.g. `"libcuda"`).
/// Directories are tried in the order listed; a directory that cannot be read
/// is skipped, as are individual entries that fail to read. Within a
/// directory the smallest matching path is chosen, so the result does not
/// depend on the order in which the OS lists entries, and an unversioned
/// `<name>.so` is preferred over versioned siblings.
///
/// Returns `None` when no searched directory contains a match.
// Only called from the BPF loader, so it is legitimately unused otherwise.
#[cfg_attr(not(feature = "bpf"), allow(dead_code))]
fn find_library(name: &str) -> Option<PathBuf> {
    const SEARCH_PATHS: [&str; 5] = [
        "/usr/lib/x86_64-linux-gnu",
        "/usr/lib64",
        "/usr/local/lib",
        "/usr/local/cuda/lib64",
        "/usr/lib",
    ];

    SEARCH_PATHS.iter().find_map(|dir| {
        std::fs::read_dir(dir)
            .ok()?
            .flatten()
            .filter(|entry| is_shared_object_for(&entry.file_name().to_string_lossy(), name))
            .map(|entry| entry.path())
            .min()
    })
}
84
85pub fn load_all_probes() -> Result<LoadResult> {
90 #[cfg(feature = "bpf")]
91 {
92 return load_bpf_probes();
93 }
94
95 #[cfg(not(feature = "bpf"))]
96 {
97 info!("BPF feature disabled — running in stub mode");
98 Ok(LoadResult {
99 status: ProbeStatus::none(),
100 })
101 }
102}
103
/// Builds the probe status for a `bpf`-enabled build.
///
/// As written, this only locates libraries, sets flags and logs:
/// - `gpu_mem` and `cuda_trace` are marked active when `libcuda` is found;
/// - `nccl` is marked active when `libnccl` is found;
/// - `dataload` and `dist_sync` are always marked active.
///
/// The visible body has no failing path, so it always returns `Ok`.
#[cfg(feature = "bpf")]
fn load_bpf_probes() -> Result<LoadResult> {
    // NOTE(review): these traits are not used by the current body; presumably
    // kept for the pending skeleton-loading code — confirm.
    use libbpf_rs::skel::{OpenSkel, SkelBuilder};

    info!("loading BPF probes");

    // Both lookups happen up front, before any per-library logging.
    let cuda_lib = find_library("libcuda");
    let nccl_lib = find_library("libnccl");
    let mut probes = ProbeStatus::none();

    match cuda_lib.as_deref() {
        Some(lib) => {
            info!(path = %lib.display(), "found libcuda.so");
            probes.gpu_mem = true;
            probes.cuda_trace = true;
            info!("GPU probes ready (skeleton loading pending full BPF compilation)");
        }
        None => {
            warn!("libcuda.so not found — GPU probes disabled");
        }
    }

    match nccl_lib.as_deref() {
        Some(lib) => {
            info!(path = %lib.display(), "found libnccl.so");
            probes.nccl = true;
            info!("NCCL probe ready");
        }
        None => {
            warn!("libnccl.so not found — NCCL probe disabled");
        }
    }

    // These two do not depend on any user-space library being present.
    probes.dataload = true;
    probes.dist_sync = true;
    info!("kernel probes ready (dataload, dist_sync)");

    info!(
        active = probes.active_count(),
        gpu_mem = probes.gpu_mem,
        cuda_trace = probes.cuda_trace,
        nccl = probes.nccl,
        dataload = probes.dataload,
        dist_sync = probes.dist_sync,
        "BPF probes loaded"
    );

    Ok(LoadResult { status: probes })
}
178
/// Registers `_pid` for BPF tracing.
///
/// As written, this only emits a log line, and only when the `bpf` feature
/// is enabled; without the feature the call does nothing.
pub fn register_pid(_pid: u32) {
    // The underscore prefix keeps the parameter warning-free when the `bpf`
    // feature is off and the statement below is compiled out.
    #[cfg(feature = "bpf")]
    info!(pid = _pid, "registered PID for BPF tracing");
}
191
/// Unregisters `_pid` from BPF tracing.
///
/// As written, this only emits a log line, and only when the `bpf` feature
/// is enabled; without the feature the call does nothing.
pub fn unregister_pid(_pid: u32) {
    // The underscore prefix keeps the parameter warning-free when the `bpf`
    // feature is off and the statement below is compiled out.
    #[cfg(feature = "bpf")]
    info!(pid = _pid, "unregistered PID from BPF tracing");
}
199
#[cfg(test)]
mod tests {
    use super::*;

    /// An all-inactive status reports zero active probes.
    #[test]
    fn probe_status_none() {
        assert_eq!(ProbeStatus::none().active_count(), 0);
    }

    /// Only the flags that are set contribute to the count.
    #[test]
    fn probe_status_counting() {
        let three_active = ProbeStatus {
            gpu_mem: true,
            cuda_trace: true,
            dataload: true,
            ..ProbeStatus::none()
        };
        assert_eq!(three_active.active_count(), 3);
    }

    /// Loading must succeed; without the `bpf` feature no probe is active.
    #[test]
    fn load_without_bpf_returns_none() {
        let loaded = load_all_probes().unwrap();
        // The count is only pinned in stub mode; with `bpf` enabled the
        // outcome depends on which libraries the host has installed.
        #[cfg(not(feature = "bpf"))]
        assert_eq!(loaded.status.active_count(), 0);
    }
}