pub async fn run_k8s_job(
job_id: &str,
script: &str,
image: &str,
gpus_per_node: u32,
nodes: u32,
namespace: &str,
args: &[String],
log_dir: &Path,
) -> Result<i32>
Submit a distributed training job to Kubernetes via the PyTorchJob CRD.