pub async fn run_ssh_job(
job_id: &str,
script: &str,
hosts: &[String],
gpus_per_node: u32,
framework: &str,
backend: &str,
args: &[String],
log_dir: &Path,
) -> Result<i32>
Launch a distributed training job across multiple nodes via SSH.