pub struct AggregatedMetrics {
pub gpu_memory: HashMap<String, GpuMemMetrics>,
pub cuda_latency: HashMap<u32, LatencyHistogram>,
pub nccl_duration: HashMap<String, LatencyHistogram>,
pub dataloader_wait: HashMap<u32, LatencyHistogram>,
pub dist_sync: HashMap<u32, LatencyHistogram>,
pub last_update_ms: u64,
}
Aggregated metrics computed from raw BPF events.
Fields§
§gpu_memory: HashMap<String, GpuMemMetrics>
§cuda_latency: HashMap<u32, LatencyHistogram>
§nccl_duration: HashMap<String, LatencyHistogram>
§dataloader_wait: HashMap<u32, LatencyHistogram>
§dist_sync: HashMap<u32, LatencyHistogram>
§last_update_ms: u64
Timestamp (ms since epoch) of last update.
Implementations§
Source§impl AggregatedMetrics
impl AggregatedMetrics
Sourcepub fn record_gpu_mem(&mut self, pid: u32, gpu_id: u32, used: u64, peak: u64)
pub fn record_gpu_mem(&mut self, pid: u32, gpu_id: u32, used: u64, peak: u64)
Record a GPU memory event.
Sourcepub fn record_cuda_latency(&mut self, pid: u32, latency_ns: u64)
pub fn record_cuda_latency(&mut self, pid: u32, latency_ns: u64)
Record a CUDA kernel launch latency.
Sourcepub fn record_nccl(&mut self, op: &str, duration_ns: u64)
pub fn record_nccl(&mut self, op: &str, duration_ns: u64)
Record an NCCL collective duration.
Sourcepub fn record_dataloader_wait(&mut self, pid: u32, wait_ns: u64)
pub fn record_dataloader_wait(&mut self, pid: u32, wait_ns: u64)
Record a DataLoader wait time.
Sourcepub fn to_prometheus(&self) -> String
pub fn to_prometheus(&self) -> String
Format as Prometheus text exposition.
Sourcepub fn to_ws_snapshot(&self) -> Value
pub fn to_ws_snapshot(&self) -> Value
Build a JSON snapshot for WebSocket push to CLI.
Trait Implementations§
Source§impl Clone for AggregatedMetrics
impl Clone for AggregatedMetrics
Source§fn clone(&self) -> AggregatedMetrics
fn clone(&self) -> AggregatedMetrics
Returns a duplicate of the value. Read more
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl Debug for AggregatedMetrics
impl Debug for AggregatedMetrics
Source§impl Default for AggregatedMetrics
impl Default for AggregatedMetrics
Source§fn default() -> AggregatedMetrics
fn default() -> AggregatedMetrics
Returns the “default value” for a type. Read more
Auto Trait Implementations§
impl Freeze for AggregatedMetrics
impl RefUnwindSafe for AggregatedMetrics
impl Send for AggregatedMetrics
impl Sync for AggregatedMetrics
impl Unpin for AggregatedMetrics
impl UnwindSafe for AggregatedMetrics
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more