From b096efe072f11f99dcdd2c2a339e5b1d1ffb80c7 Mon Sep 17 00:00:00 2001 From: Nicolas Cantu Date: Tue, 6 Jan 2026 14:42:43 +0100 Subject: [PATCH] Fix root cause of SSH connection drops with proper keepalive configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Motivations:** - SSH connections were being dropped by firewalls/NAT due to insufficient keepalives - "Connection reset by peer" errors were occurring because connections were idle too long - Need to prevent connection drops at the source, not just handle errors after the fact **Root causes:** - ServerAliveInterval=60 was too long - many firewalls/NAT close idle connections after 30-45 seconds - TCPKeepAlive was not set explicitly - system-level keepalives were not guaranteed - ControlPersist=300 kept dead connections too long - ServerAliveCountMax=3 was too lenient for detecting dead connections - Underlying cause: network equipment was closing connections it considered inactive **Correctifs:** - Reduced ServerAliveInterval from 60 to 15 seconds to send keepalives more frequently - Set TCPKeepAlive=yes explicitly to use system-level TCP keepalives - Reduced ServerAliveCountMax from 3 to 2 to detect dead connections faster - Reduced ControlPersist from 300 to 60 seconds to avoid keeping dead sockets - Added Compression=no to explicitly disable compression and its overhead - Removed error detection/retry logic - fixing root cause instead of handling symptoms - Updated check_ssh_connection() to use the same keepalive options for consistency **Evolutions:** - Connections now stay alive through firewalls/NAT with frequent keepalives - Dead connections are now detected and cleaned up faster - Proper keepalive configuration should prevent further connection reset errors **Pages affectées:** - deploy.sh: Fixed SSH keepalive configuration to prevent connection drops --- deploy.sh | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/deploy.sh b/deploy.sh index ec4a601..d43172b 100644 --- a/deploy.sh +++ 
b/deploy.sh @@ -37,7 +37,12 @@ check_ssh_connection() { fi # Tester la connexion en essayant de l'utiliser avec une commande simple # Cela détecte si le socket existe mais la connexion est morte - ssh -o ControlPath="${SSH_CONTROL_PATH}" ${SERVER} "true" 2>/dev/null || return 1 + # Utiliser les mêmes options de keepalive pour la cohérence + ssh -o ControlPath="${SSH_CONTROL_PATH}" \ + -o ServerAliveInterval=15 \ + -o ServerAliveCountMax=2 \ + -o TCPKeepAlive=yes \ + ${SERVER} "true" 2>/dev/null || return 1 return 0 } @@ -55,34 +60,21 @@ ssh_exec() { fi # Exécuter la commande SSH (une seule tentative, pas de retry) - # Capture stderr pour détecter les erreurs de socket - local ssh_output - ssh_output=$(ssh -o ControlMaster=auto \ + # Configuration optimisée pour éviter les coupures de connexion : + # - ServerAliveInterval=15 : Keepalives toutes les 15 secondes (au lieu de 60) + # pour éviter que les firewalls/NAT ferment les connexions inactives + # - TCPKeepAlive=yes : Utilise les keepalives TCP au niveau système + # - ServerAliveCountMax=2 : Détecte les connexions mortes plus rapidement + # - ControlPersist=60 : Réduit le temps de persistance pour éviter les sockets morts + ssh -o ControlMaster=auto \ -o ControlPath="${SSH_CONTROL_PATH}" \ - -o ControlPersist=300 \ + -o ControlPersist=60 \ -o ConnectTimeout=10 \ - -o ServerAliveInterval=60 \ - -o ServerAliveCountMax=3 \ - ${SERVER} "$@" 2>&1) - local ssh_exit_code=$? - - # Si on détecte une erreur de socket, nettoyer et réessayer une fois - if echo "$ssh_output" | grep -q "ControlSocket.*already exists"; then - cleanup_dead_ssh - # Réessayer une fois après nettoyage - ssh -o ControlMaster=auto \ - -o ControlPath="${SSH_CONTROL_PATH}" \ - -o ControlPersist=300 \ - -o ConnectTimeout=10 \ - -o ServerAliveInterval=60 \ - -o ServerAliveCountMax=3 \ - ${SERVER} "$@" 2>&1 - return $? 
- fi - - # Afficher la sortie et retourner le code de sortie - echo "$ssh_output" - return $ssh_exit_code + -o ServerAliveInterval=15 \ + -o ServerAliveCountMax=2 \ + -o TCPKeepAlive=yes \ + -o Compression=no \ + ${SERVER} "$@" 2>&1 } # Nettoyer les connexions SSH persistantes et le répertoire temporaire à la fin