From 5da6127a94be503deaa2c525e6b8d416db32b177 Mon Sep 17 00:00:00 2001 From: gursewak1997 Date: Fri, 19 Dec 2025 15:43:53 -0800 Subject: [PATCH] libvirt/ssh: Enhance SSH timeouts with time-based retry Reduce default SSH timeout from 30s to 5s for faster failure detection. Implement 60-second time-based retry with 1-second polling intervals. Do connectivity check first with retries, then oneshot command execution to prevent dangerous retries of user commands. Also add an integration test to test this. Signed-off-by: gursewak1997 --- .../src/tests/run_ephemeral_ssh.rs | 59 ++++++ crates/kit/src/libvirt/ssh.rs | 193 +++++++++++------- crates/kit/src/ssh.rs | 4 +- 3 files changed, 182 insertions(+), 74 deletions(-) diff --git a/crates/integration-tests/src/tests/run_ephemeral_ssh.rs b/crates/integration-tests/src/tests/run_ephemeral_ssh.rs index 8a4d025f..6703d7ee 100644 --- a/crates/integration-tests/src/tests/run_ephemeral_ssh.rs +++ b/crates/integration-tests/src/tests/run_ephemeral_ssh.rs @@ -398,3 +398,62 @@ fn test_run_ephemeral_dns_resolution() -> Result<()> { Ok(()) } integration_test!(test_run_ephemeral_dns_resolution); + +/// Test SSH timeout behavior when SSH is unavailable +/// +/// Verifies that ephemeral run-ssh properly times out when SSH is masked. +/// Uses systemd.mask=sshd.service to disable SSH, triggering the timeout mechanism. +/// +/// Note: This tests the ephemeral timeout (~240s), not the libvirt SSH timeout (~60s). +/// The libvirt SSH timeout (60s) is used by `bcvk libvirt ssh` and would require +/// creating a libvirt VM to test properly. +fn test_run_ephemeral_ssh_timeout() -> Result<()> { + eprintln!("Testing SSH timeout with masked sshd.service..."); + eprintln!("This test takes ~240 seconds to complete..."); + + let start = Instant::now(); + + let output = run_bcvk(&[ + "ephemeral", + "run-ssh", + "--label", + INTEGRATION_TEST_LABEL, + "--karg", + "systemd.mask=sshd.service", + &get_test_image(), + "--", + "echo", + "should not reach here", + ])?; + + let elapsed = start.elapsed(); + + // Command should fail (SSH timeout) + assert!( + !output.success(), + "Expected ephemeral run-ssh to fail with SSH timeout, but it succeeded" + ); + + // Verify the error message mentions timeout or readiness failure + assert!( + output.stderr.contains("Timeout waiting for readiness") + || output.stderr.contains("timeout") + || output.stderr.contains("failed after"), + "Expected error about timeout, got: {}", + output.stderr + ); + + // Verify timeout duration is approximately 240 seconds (±20s tolerance for CI variability) + let timeout_secs = elapsed.as_secs(); + assert!( + timeout_secs >= 220 && timeout_secs <= 260, + "Expected timeout around 240 seconds, but got {} seconds. \ + This suggests the ephemeral SSH timeout may not be working correctly.", + timeout_secs + ); + + eprintln!("✓ SSH timeout worked correctly ({}s)", timeout_secs); + + Ok(()) +} +integration_test!(test_run_ephemeral_ssh_timeout); diff --git a/crates/kit/src/libvirt/ssh.rs b/crates/kit/src/libvirt/ssh.rs index 577028cd..3d426154 100644 --- a/crates/kit/src/libvirt/ssh.rs +++ b/crates/kit/src/libvirt/ssh.rs @@ -15,9 +15,15 @@ use std::io::Write; use std::os::unix::fs::PermissionsExt as _; use std::os::unix::process::CommandExt; use std::process::Command; +use std::time::{Duration, Instant}; use tempfile; use tracing::debug; +// SSH retry configuration +const SSH_RETRY_TIMEOUT_SECS: u64 = 60; // Total time to retry SSH connections +const SSH_POLL_DELAY_SECS: u64 = 1; // Delay between SSH attempts +const SSH_SERVER_ALIVE_INTERVAL: u32 = 60; // Server alive interval in seconds + /// Configuration options for SSH connection to libvirt domain #[derive(Debug, Parser)] pub struct LibvirtSshOpts { @@ -36,7 +42,7 @@ pub struct LibvirtSshOpts { pub strict_host_keys: bool, /// SSH connection timeout in seconds - #[clap(long, default_value = "30")] + #[clap(long, default_value = "5")] pub timeout: u32, /// SSH log level @@ -236,8 +242,39 @@ impl LibvirtSshOpts { Ok(temp_key) } - /// Execute SSH connection to domain - fn connect_ssh(&self, ssh_config: &DomainSshConfig) -> Result<()> { + /// Build SSH command with configured options + fn build_ssh_command( + &self, + ssh_config: &DomainSshConfig, + temp_key: &tempfile::NamedTempFile, + parsed_extra_options: Vec<(String, String)>, + ) -> Command { + let mut ssh_cmd = Command::new("ssh"); + ssh_cmd + .arg("-i") + .arg(temp_key.path()) + .arg("-p") + .arg(ssh_config.ssh_port.to_string()); + + let common_opts = crate::ssh::CommonSshOptions { + strict_host_keys: self.strict_host_keys, + connect_timeout: self.timeout, + server_alive_interval: SSH_SERVER_ALIVE_INTERVAL, + log_level: self.log_level.clone(), + extra_options: parsed_extra_options, + }; + common_opts.apply_to_command(&mut ssh_cmd); + ssh_cmd.arg(format!("{}@127.0.0.1", self.user)); + + ssh_cmd + } + + /// Execute SSH connection to domain with retries + fn connect_ssh( + &self, + _global_opts: &crate::libvirt::LibvirtOptions, + ssh_config: &DomainSshConfig, + ) -> Result<()> { debug!( "Connecting to domain '{}' via SSH on port {} (user: {})", self.domain_name, ssh_config.ssh_port, self.user @@ -250,17 +287,7 @@ impl LibvirtSshOpts { // Create temporary SSH key file let temp_key = self.create_temp_ssh_key(ssh_config)?; - // Build SSH command - let mut ssh_cmd = Command::new("ssh"); - - // Add SSH key and port - ssh_cmd - .arg("-i") - .arg(temp_key.path()) - .arg("-p") - .arg(ssh_config.ssh_port.to_string()); - - // Parse extra options from key=value format + // Parse extra options let mut parsed_extra_options = Vec::new(); for option in &self.extra_options { if let Some((key, value)) = option.split_once('=') { @@ -273,76 +300,98 @@ impl LibvirtSshOpts { } } - // Apply common SSH options - let common_opts = crate::ssh::CommonSshOptions { - strict_host_keys: self.strict_host_keys, - connect_timeout: self.timeout, - server_alive_interval: 60, - log_level: self.log_level.clone(), - extra_options: parsed_extra_options, - }; - common_opts.apply_to_command(&mut ssh_cmd); + let start_time = Instant::now(); + let timeout = Duration::from_secs(SSH_RETRY_TIMEOUT_SECS); - // Target host - ssh_cmd.arg(format!("{}@127.0.0.1", self.user)); + // First, do connectivity check with retries (for both interactive and command) + debug!("Testing SSH connectivity before session"); - // Add command if specified - use the same argument escaping logic as container SSH - if !self.command.is_empty() { - ssh_cmd.arg("--"); - if self.command.len() > 1 { - // Multiple arguments need proper shell escaping - let combined_command = crate::ssh::shell_escape_command(&self.command) - .map_err(|e| eyre!("Failed to escape shell command: {}", e))?; - debug!("Combined escaped command: {}", combined_command); - ssh_cmd.arg(combined_command); - } else { - // Single argument can be passed directly - ssh_cmd.args(&self.command); + // Create progress bar for user feedback (only shown in terminals) + let pb = crate::boot_progress::create_boot_progress_bar(); + pb.set_message("Waiting for SSH to be ready..."); + + loop { + let mut test_cmd = + self.build_ssh_command(ssh_config, &temp_key, parsed_extra_options.clone()); + test_cmd.arg("--").arg("true"); // Simple test command + + let output = test_cmd.output().context("Failed to spawn SSH command")?; + + if output.status.success() { + debug!( + "SSH connectivity confirmed after {:.1}s", + start_time.elapsed().as_secs_f64() + ); + pb.finish_and_clear(); + break; } - } - debug!("Executing SSH command: {:?}", ssh_cmd); + // Check if we've exceeded timeout + if start_time.elapsed() >= timeout { + pb.finish_and_clear(); + if !self.suppress_output { + let stderr_str = String::from_utf8_lossy(&output.stderr); + eprint!("{}", stderr_str); + eprintln!( + "\nSSH connection failed after {:.1}s. To see VM console output, run: virsh console {}", + start_time.elapsed().as_secs_f64(), + self.domain_name + ); + } + return Err(eyre!("SSH connection failed after timeout")); + } - // For commands (non-interactive SSH), capture output - // For interactive SSH (no command), exec to replace current process - if self.command.is_empty() { - // Interactive SSH - exec to replace the current process - // This provides the cleanest terminal experience - debug!("Executing interactive SSH session via exec"); + std::thread::sleep(Duration::from_secs(SSH_POLL_DELAY_SECS)); + } + // SSH is ready - now do the actual operation (oneshot) + if self.command.is_empty() { + // Interactive: exec directly + debug!("SSH ready, launching interactive session"); + let mut ssh_cmd = self.build_ssh_command(ssh_config, &temp_key, parsed_extra_options); let error = ssh_cmd.exec(); - // exec() only returns on error return Err(eyre!("Failed to exec SSH command: {}", error)); + } + + // Command execution: single attempt since we already confirmed connectivity + debug!("SSH ready, executing command"); + let mut ssh_cmd = self.build_ssh_command(ssh_config, &temp_key, parsed_extra_options); + + // Add command + ssh_cmd.arg("--"); + if self.command.len() > 1 { + let combined_command = crate::ssh::shell_escape_command(&self.command) + .map_err(|e| eyre!("Failed to escape shell command: {}", e))?; + ssh_cmd.arg(combined_command); } else { - // Command execution - capture and forward output - let output = ssh_cmd - .output() - .map_err(|e| eyre!("Failed to execute SSH command: {}", e))?; + ssh_cmd.args(&self.command); + } - if !output.stdout.is_empty() { - if !self.suppress_output { - // Forward stdout to parent process - print!("{}", String::from_utf8_lossy(&output.stdout)); - } - debug!("SSH stdout: {}", String::from_utf8_lossy(&output.stdout)); - } - if !output.stderr.is_empty() { - if !self.suppress_output { - // Forward stderr to parent process - eprint!("{}", String::from_utf8_lossy(&output.stderr)); - } - debug!("SSH stderr: {}", String::from_utf8_lossy(&output.stderr)); - } + // Execute command + let output = ssh_cmd + .output() + .map_err(|e| eyre!("Failed to execute SSH command: {}", e))?; - if !output.status.success() { - return Err(eyre!( - "SSH connection failed with exit code: {}", - output.status.code().unwrap_or(-1) - )); + if output.status.success() { + if !output.stdout.is_empty() && !self.suppress_output { + print!("{}", String::from_utf8_lossy(&output.stdout)); } + debug!( + "Command completed successfully after {:.1}s total", + start_time.elapsed().as_secs_f64() + ); + return Ok(()); } - Ok(()) + // Command failed + if !self.suppress_output { + let stderr_str = String::from_utf8_lossy(&output.stderr); + eprint!("{}", stderr_str); + } + Err(eyre!( + "SSH command failed with exit code: {:?}", + output.status.code() + )) } } @@ -377,8 +426,8 @@ pub fn run_ssh_impl( // Extract SSH configuration from domain metadata let ssh_config = opts.extract_ssh_config(global_opts)?; - // Connect via SSH - opts.connect_ssh(&ssh_config)?; + // Connect via SSH with retries + opts.connect_ssh(global_opts, &ssh_config)?; Ok(()) } diff --git a/crates/kit/src/ssh.rs b/crates/kit/src/ssh.rs index 4fa78087..49177e73 100644 --- a/crates/kit/src/ssh.rs +++ b/crates/kit/src/ssh.rs @@ -220,7 +220,7 @@ impl Default for CommonSshOptions { fn default() -> Self { Self { strict_host_keys: false, - connect_timeout: 30, + connect_timeout: 1, server_alive_interval: 60, log_level: "ERROR".to_string(), extra_options: vec![], @@ -363,7 +363,7 @@ mod tests { fn test_ssh_connection_options() { // Test default options let default_opts = SshConnectionOptions::default(); - assert_eq!(default_opts.common.connect_timeout, 30); + assert_eq!(default_opts.common.connect_timeout, 1); assert!(default_opts.allocate_tty); assert_eq!(default_opts.common.log_level, "ERROR"); assert!(default_opts.common.extra_options.is_empty());