zernel_ebpf/consumers/
gpu_mem.rs

1// Copyright (C) 2026 Dyber, Inc. — GPL-2.0
2
3use serde::Serialize;
4use tracing::warn;
5
/// GPU memory allocation event from BPF uprobe on libcuda.so.
/// Layout must match struct zernel_gpu_mem_event in common.h.
///
/// `#[repr(C)]` keeps the fields in declaration order, so fields must not be
/// reordered, added, or removed here without making the same change on the
/// C side. The struct is two `u32` fields followed by four `u64` fields.
#[derive(Debug, Clone, Serialize)]
#[repr(C)]
pub struct GpuMemEvent {
    /// Presumably the ID of the process that triggered the event — confirm
    /// against common.h.
    pub pid: u32,
    /// Presumably the index of the GPU the event refers to — confirm against
    /// common.h.
    pub gpu_id: u32,
    /// Presumably the number of bytes allocated by this event — confirm
    /// against common.h.
    pub alloc_bytes: u64,
    /// Presumably the number of bytes freed by this event — confirm against
    /// common.h.
    pub free_bytes: u64,
    /// Usage figure that `GpuMemConsumer::process_event` divides by the
    /// configured total GPU memory in bytes for its OOM check, so it is
    /// treated as a byte count there.
    pub total_usage: u64,
    /// Event timestamp; the `_ns` suffix suggests nanoseconds. The clock
    /// source is not visible here — confirm against the BPF program.
    pub timestamp_ns: u64,
}
18
/// Processes GPU memory events from the BPF ring buffer.
///
/// Holds only configuration; `process_event` takes `&self` and does not
/// modify either field.
pub struct GpuMemConsumer {
    /// OOM warning threshold as fraction of total GPU memory (0.0 - 1.0).
    ///
    /// A warning is logged only when the usage fraction is strictly greater
    /// than this value. The range is not validated anywhere in this type.
    pub oom_threshold: f64,
    /// Total GPU memory per device in bytes (for OOM calculation).
    ///
    /// The same total is used for every event regardless of its `gpu_id`.
    /// When this is 0 the OOM check is skipped entirely.
    pub gpu_total_bytes: u64,
}
26
27impl GpuMemConsumer {
28    pub fn new(oom_threshold: f64, gpu_total_bytes: u64) -> Self {
29        Self {
30            oom_threshold,
31            gpu_total_bytes,
32        }
33    }
34
35    /// Process a raw event from the BPF ring buffer.
36    /// Returns the deserialized event, or None if the buffer is too small.
37    pub fn process_event(&self, raw: &[u8]) -> Option<GpuMemEvent> {
38        let event = deserialize_event::<GpuMemEvent>(raw)?;
39
40        // Check OOM threshold
41        if self.gpu_total_bytes > 0 {
42            let usage_frac = event.total_usage as f64 / self.gpu_total_bytes as f64;
43            if usage_frac > self.oom_threshold {
44                warn!(
45                    pid = event.pid,
46                    gpu_id = event.gpu_id,
47                    usage_pct = usage_frac * 100.0,
48                    "GPU memory usage exceeds OOM threshold"
49                );
50            }
51        }
52
53        Some(event)
54    }
55}
56
/// Safely deserialize a C struct from a raw byte buffer.
///
/// Copies the first `size_of::<T>()` bytes of `raw` into a properly aligned
/// temporary and returns a clone of it. Any bytes past `size_of::<T>()` are
/// ignored.
///
/// Returns `None` if the buffer is too small. The buffer does NOT need to be
/// aligned for `T`: a `&[u8]` only guarantees byte alignment, so the value is
/// read with an unaligned load rather than by casting the pointer to `&T`
/// (which would be undefined behaviour for a misaligned address).
///
/// Only instantiate this with plain-data `#[repr(C)]` types for which every
/// bit pattern is a valid value; the bytes themselves are not validated.
fn deserialize_event<T: Clone>(raw: &[u8]) -> Option<T> {
    if raw.len() < std::mem::size_of::<T>() {
        return None;
    }
    // SAFETY: The length check above guarantees `size_of::<T>()` readable
    // bytes starting at `raw.as_ptr()`. `read_unaligned` places no alignment
    // requirement on the source pointer. The caller is relied upon to pass
    // bytes that were written with `T`'s exact layout (matching #[repr(C)] on
    // the Rust side and the C struct in common.h).
    let staged = std::mem::ManuallyDrop::new(unsafe {
        std::ptr::read_unaligned(raw.as_ptr().cast::<T>())
    });
    // `ManuallyDrop` ensures the bitwise copy is never dropped; only the
    // clone is handed back, exactly as when cloning through a reference.
    Some(T::clone(&staged))
}
69
#[cfg(test)]
mod tests {
    use super::*;

    /// Total GPU memory handed to the consumer in these tests (80 GiB).
    const CAPACITY_BYTES: u64 = 80 * 1024 * 1024 * 1024;

    /// Views an event as its raw in-memory bytes, simulating a record read
    /// from the BPF ring buffer.
    fn as_raw_bytes(event: &GpuMemEvent) -> &[u8] {
        let start = event as *const GpuMemEvent as *const u8;
        let len = std::mem::size_of::<GpuMemEvent>();
        // SAFETY: `start` points at a live `GpuMemEvent` borrowed for the
        // lifetime of the returned slice, and `len` is exactly its size.
        unsafe { std::slice::from_raw_parts(start, len) }
    }

    #[test]
    fn deserialize_gpu_mem_event() {
        let original = GpuMemEvent {
            pid: 1234,
            gpu_id: 0,
            alloc_bytes: 1024 * 1024,
            free_bytes: 0,
            total_usage: CAPACITY_BYTES,
            timestamp_ns: 999999,
        };

        let consumer = GpuMemConsumer::new(0.95, CAPACITY_BYTES);
        let decoded = consumer.process_event(as_raw_bytes(&original)).unwrap();

        assert_eq!(decoded.pid, 1234);
        assert_eq!(decoded.alloc_bytes, 1024 * 1024);
    }

    #[test]
    fn rejects_undersized_buffer() {
        let too_short = [0u8; 4];
        let consumer = GpuMemConsumer::new(0.95, CAPACITY_BYTES);
        assert!(consumer.process_event(&too_short).is_none());
    }
}