// zernel/main.rs

1// Copyright (C) 2026 Dyber, Inc. — Proprietary
2
3#![allow(dead_code)]
4
5mod commands;
6mod experiments;
7mod telemetry;
8pub mod validation;
9mod zql;
10
11use anyhow::Result;
12use clap::{Parser, Subcommand};
13
// Root parser for the `zernel` binary. clap derives the full CLI surface from
// this struct plus the `Commands` enum below; `Cli::parse()` in `main` does the
// actual argument handling (including --help / --version).
#[derive(Parser)]
#[command(name = "zernel")]
#[command(about = "Zernel — AI-Native ML Developer Environment")]
#[command(version)]
// NOTE: the after_help text below is user-visible help output — keep the
// examples in sync with the subcommands defined in `Commands`.
#[command(after_help = "Examples:
  zernel init my-project         Scaffold a new ML project
  zernel run train.py            Run with automatic tracking
  zernel watch                   Live GPU + training dashboard
  zernel gpu status              GPU management
  zernel bench all               Run ML benchmark suite
  zernel debug why-slow          Diagnose training bottlenecks
  zernel data profile ./data     Dataset statistics
  zernel cluster status          Cluster overview
  zernel serve start ./model     Start inference server
  zernel exp list                List all experiments
  zernel model save ./ckpt       Save a model checkpoint
  zernel job submit train.py     Submit distributed training
  zernel doctor                  Diagnose environment
  zernel query \"SELECT ...\"      Query with ZQL")]
struct Cli {
    // The single required positional: which subcommand to run.
    #[command(subcommand)]
    command: Commands,
}
37
// All top-level subcommands. The `///` doc comments double as clap help text,
// so their wording is user-visible — treat them as part of the CLI contract.
// Variants wrapping a `commands::*::*Commands` type are nested subcommand
// groups (e.g. `zernel gpu top`); the rest are leaf commands with inline args.
#[derive(Subcommand)]
enum Commands {
    /// Initialize a new ML project
    Init { name: String },
    /// Run a training script with automatic telemetry and experiment tracking
    Run {
        script: String,
        // trailing_var_arg: everything after the script path is passed through
        // verbatim to the script rather than parsed as zernel flags.
        #[arg(trailing_var_arg = true)]
        args: Vec<String>,
    },
    /// Live dashboard — GPU utilization, training metrics, eBPF telemetry
    Watch,
    /// Diagnose environment issues
    Doctor,
    /// GPU management — top, mem, kill, lock, health
    #[command(subcommand)]
    Gpu(commands::gpu::GpuCommands),
    /// ML benchmark suite — gpu, nccl, dataloader, memory, e2e
    #[command(subcommand)]
    Bench(commands::bench::BenchCommands),
    /// ML training debugger — why-slow, oom, nan, hang
    #[command(subcommand)]
    Debug(commands::debug::DebugCommands),
    /// Experiment tracking
    #[command(subcommand)]
    Exp(commands::exp::ExpCommands),
    /// Show training logs
    Log {
        // Specific run id; defaults to the latest run when omitted.
        #[arg(long)]
        id: Option<String>,
        // Tail the log as it grows (like `tail -f`).
        #[arg(long, short)]
        follow: bool,
        // Only show lines matching this pattern.
        #[arg(long)]
        grep: Option<String>,
    },
    /// Dataset management — profile, split, cache, shard
    #[command(subcommand)]
    Data(commands::data::DataCommands),
    /// Model registry
    #[command(subcommand)]
    Model(commands::model::ModelCommands),
    /// Unified inference server — start, stop, benchmark
    #[command(subcommand)]
    Serve(commands::serve::ServeCommands),
    /// Private model & dataset hub
    #[command(subcommand)]
    Hub(commands::hub::HubCommands),
    /// Distributed job management
    #[command(subcommand)]
    Job(commands::job::JobCommands),
    /// GPU cluster management — add, status, sync, run, drain
    #[command(subcommand)]
    Cluster(commands::cluster::ClusterCommands),
    /// Environment management — snapshot, diff, reproduce, export
    #[command(subcommand)]
    Env(commands::env::EnvCommands),
    /// Smart GPU power management & energy tracking
    #[command(subcommand)]
    Power(commands::power::PowerCommands),
    /// Training optimizations — precision, memory, checkpoints, NUMA
    #[command(subcommand)]
    Optimize(commands::optimize::OptimizeCommands),
    /// Autonomous training optimizer — monitors and fixes problems automatically
    #[command(subcommand)]
    Autopilot(commands::autopilot::AutopilotCommands),
    /// GPU cloud management — launch, manage, destroy clusters
    #[command(subcommand)]
    Cloud(commands::cloud::CloudCommands),
    /// Model marketplace — publish, browse, download, deploy
    #[command(subcommand)]
    Marketplace(commands::marketplace::MarketplaceCommands),
    /// Adaptive kernel parameter tuning based on hardware
    #[command(subcommand)]
    Tune(commands::tune::TuneCommands),
    /// Live job migration between GPUs
    #[command(subcommand)]
    Migrate(commands::migrate::MigrateCommands),
    /// Full training pipeline profiler with waterfall
    #[command(subcommand)]
    Profile(commands::profile::ProfileCommands),
    /// System hardening for production ML
    #[command(subcommand)]
    Secure(commands::secure::SecureCommands),
    /// GPU fleet management — cost attribution, idle detection, capacity planning
    #[command(subcommand)]
    Fleet(commands::fleet::FleetCommands),
    /// Compliance audit trail — lineage, provenance, HIPAA/SOC2 exports
    #[command(subcommand)]
    Audit(commands::audit::AuditCommands),
    /// Developer onboarding — one-command setup, env sync, sharing
    #[command(subcommand)]
    Onboard(commands::onboard::OnboardCommands),
    /// Post-Quantum Cryptography — sign, verify, encrypt, decrypt
    #[command(subcommand)]
    Pqc(commands::pqc::PqcCommands),
    /// GPU cost tracking — summary, budget, report
    #[command(subcommand)]
    Cost(commands::cost::CostCommands),
    /// Jupyter notebook management
    #[command(subcommand)]
    Notebook(commands::notebook::NotebookCommands),
    /// Query experiments, jobs, models with ZQL
    Query { query: String },
    /// Install ML tools (pytorch, ollama, jupyter, etc.)
    Install { tool: String },
}
144
145#[tokio::main]
146async fn main() -> Result<()> {
147    tracing_subscriber::fmt()
148        .with_env_filter(std::env::var("ZERNEL_LOG").unwrap_or_else(|_| "zernel=warn".into()))
149        .init();
150
151    let cli = Cli::parse();
152
153    match cli.command {
154        Commands::Init { name } => commands::init::run(&name).await,
155        Commands::Run { script, args } => commands::run::run(&script, &args).await,
156        Commands::Watch => commands::watch::run().await,
157        Commands::Doctor => commands::doctor::run().await,
158        Commands::Gpu(cmd) => commands::gpu::run(cmd).await,
159        Commands::Bench(cmd) => commands::bench::run(cmd).await,
160        Commands::Debug(cmd) => commands::debug::run(cmd).await,
161        Commands::Exp(cmd) => commands::exp::run(cmd).await,
162        Commands::Log { id, follow, grep } => commands::log::run(id, follow, grep).await,
163        Commands::Data(cmd) => commands::data::run(cmd).await,
164        Commands::Model(cmd) => commands::model::run(cmd).await,
165        Commands::Serve(cmd) => commands::serve::run(cmd).await,
166        Commands::Hub(cmd) => commands::hub::run(cmd).await,
167        Commands::Job(cmd) => commands::job::run(cmd).await,
168        Commands::Cluster(cmd) => commands::cluster::run(cmd).await,
169        Commands::Env(cmd) => commands::env::run(cmd).await,
170        Commands::Cost(cmd) => commands::cost::run(cmd).await,
171        Commands::Notebook(cmd) => commands::notebook::run(cmd).await,
172        Commands::Power(cmd) => commands::power::run(cmd).await,
173        Commands::Optimize(cmd) => commands::optimize::run(cmd).await,
174        Commands::Autopilot(cmd) => commands::autopilot::run(cmd).await,
175        Commands::Cloud(cmd) => commands::cloud::run(cmd).await,
176        Commands::Marketplace(cmd) => commands::marketplace::run(cmd).await,
177        Commands::Tune(cmd) => commands::tune::run(cmd).await,
178        Commands::Migrate(cmd) => commands::migrate::run(cmd).await,
179        Commands::Profile(cmd) => commands::profile::run(cmd).await,
180        Commands::Secure(cmd) => commands::secure::run(cmd).await,
181        Commands::Pqc(cmd) => commands::pqc::run(cmd).await,
182        Commands::Fleet(cmd) => commands::fleet::run(cmd).await,
183        Commands::Audit(cmd) => commands::audit::run(cmd).await,
184        Commands::Onboard(cmd) => commands::onboard::run(cmd).await,
185        Commands::Query { query } => {
186            let result = zql::executor::execute(&query)?;
187            println!("{result}");
188            Ok(())
189        }
190        Commands::Install { tool } => {
191            let status = std::process::Command::new("zernel-install")
192                .arg(&tool)
193                .status();
194            match status {
195                Ok(s) if s.success() => Ok(()),
196                _ => {
197                    println!("zernel-install not in PATH. Run: sudo cp distro/scripts/zernel-install /usr/local/bin/");
198                    Ok(())
199                }
200            }
201        }
202    }
203}