diff --git a/deploy.sh b/deploy.sh
index b86056b..a729e9d 100644
--- a/deploy.sh
+++ b/deploy.sh
@@ -14,14 +14,65 @@ SSH_CONTROL_DIR="/tmp/ssh_control_$$"
 mkdir -p "${SSH_CONTROL_DIR}"
 SSH_CONTROL_PATH="${SSH_CONTROL_DIR}/debian_92.243.27.35_22"
 
-# Fonction pour exécuter une commande SSH avec connexion persistante
+# Tear down the shared master connection and delete its control socket.
+# Best-effort: failures are ignored because the master may already be gone.
+cleanup_dead_ssh() {
+    ssh -O exit -o ControlPath="${SSH_CONTROL_PATH}" "${SERVER}" 2>/dev/null || true
+    rm -f "${SSH_CONTROL_PATH}" 2>/dev/null || true
+}
+
+# Succeed only when the master connection behind SSH_CONTROL_PATH answers
+# "ssh -O check"; every failure is reported as status 1.
+check_ssh_connection() {
+    ssh -O check -o ControlPath="${SSH_CONTROL_PATH}" "${SERVER}" 2>/dev/null || return 1
+}
+
+# Run a command on ${SERVER} through the shared ControlMaster connection.
+# Arguments: the remote command, handed to ssh unchanged.
+# Returns:   the remote command's exit status, or 255 when ssh still cannot
+#            connect after max_retries attempts.
+# Only exit status 255 (ssh's own failure code) triggers a retry. Any other
+# status comes from the remote command; re-running it could repeat side
+# effects and would needlessly tear down a healthy master connection.
 ssh_exec() {
-    ssh -o ControlMaster=auto -o ControlPath="${SSH_CONTROL_PATH}" -o ControlPersist=300 ${SERVER} "$@"
+    local max_retries=3
+    local attempt=0
+    local exit_code=0
+
+    while [ "${attempt}" -lt "${max_retries}" ]; do
+        # Discard a master that no longer answers before ssh tries to reuse it.
+        if [ -S "${SSH_CONTROL_PATH}" ] && ! check_ssh_connection; then
+            cleanup_dead_ssh
+        fi
+
+        # stderr stays separate from stdout so that callers capturing the
+        # output with $(ssh_exec ...) never receive ssh diagnostics.
+        exit_code=0
+        ssh -o ControlMaster=auto \
+            -o ControlPath="${SSH_CONTROL_PATH}" \
+            -o ControlPersist=300 \
+            -o ConnectTimeout=10 \
+            -o ServerAliveInterval=60 \
+            -o ServerAliveCountMax=3 \
+            "${SERVER}" "$@" || exit_code=$?
+
+        if [ "${exit_code}" -ne 255 ]; then
+            return "${exit_code}"
+        fi
+
+        attempt=$((attempt + 1))
+        if [ "${attempt}" -lt "${max_retries}" ]; then
+            cleanup_dead_ssh
+            sleep 1
+        fi
+    done
+
+    return "${exit_code}"
 }
 
 # Nettoyer les connexions SSH persistantes et le répertoire temporaire à la fin
 cleanup_ssh() {
-    ssh -O exit -o ControlPath="${SSH_CONTROL_PATH}" ${SERVER} 2>/dev/null || true
+    cleanup_dead_ssh
     rm -rf "${SSH_CONTROL_DIR}" 2>/dev/null || true
 }
 trap cleanup_ssh EXIT
@@ -99,12 +150,20 @@ fi
 # Vérifier si Git est initialisé sur le serveur
 echo ""
 echo "5. Vérification du dépôt Git sur le serveur..."
-if ssh_exec "cd ${APP_DIR} && git status >/dev/null 2>&1"; then
+# Decide on the exit status rather than on captured text: ssh_exec returns
+# 255 when the server is unreachable and git's own status otherwise, so a
+# connection failure is never mistaken for "repository not initialised".
+git_check_status=0
+ssh_exec "cd ${APP_DIR} && git status >/dev/null 2>&1" || git_check_status=$?
+if [ "${git_check_status}" -eq 255 ]; then
+    echo " ✗ Erreur de connexion SSH lors de la vérification du dépôt Git" >&2
+    exit 1
+elif [ "${git_check_status}" -eq 0 ]; then
     echo " ✓ Dépôt Git détecté"
 else
     echo " ⚠ Dépôt Git non initialisé, initialisation..."
     ssh_exec "cd ${APP_DIR} && git init && git remote add origin ${GIT_REPO} 2>/dev/null || git remote set-url origin ${GIT_REPO}"
     ssh_exec "cd ${APP_DIR} && git checkout -b ${BRANCH} 2>/dev/null || true"
 fi
 
 # Récupérer les dernières modifications
diff --git a/fixKnowledge/ssh-connection-errors-deployment.md b/fixKnowledge/ssh-connection-errors-deployment.md
new file mode 100644
index 0000000..7300b6d
--- /dev/null
+++ b/fixKnowledge/ssh-connection-errors-deployment.md
@@ -0,0 +1,179 @@
+# SSH Connection Errors During Deployment
+
+**Date**: 2024-12-19
+**Author**: Équipe 4NK
+
+## Problem Description
+
+During deployment, SSH connection errors occurred when verifying the Git repository on the server. The errors were:
+
+```
+mux_client_request_session: read from master failed: Connection reset by peer
+Failed to connect to new control master
+mm_send_fd: sendmsg(2): Broken pipe
+mux_client_request_session: send fds failed
+```
+
+These errors appeared at step 5 of the deployment script when checking if Git is initialized on the server.
+
+## Impact
+
+- **Severity**: Medium
+- **Scope**: Deployment script reliability
+- **User Impact**: Deployment could fail or continue with errors, potentially leaving the server in an inconsistent state
+- **Frequency**: Intermittent, occurring when SSH ControlMaster connection is interrupted
+
+## Root Cause
+
+The SSH ControlMaster multiplexing connection was being closed prematurely or becoming stale, causing subsequent SSH commands to fail. The original `ssh_exec` function did not handle connection failures robustly:
+
+1. **No connection validation**: The function did not check if the ControlMaster socket was still valid before use
+2. **No retry mechanism**: Failed connections were not retried after cleanup
+3. **No dead connection cleanup**: Stale connections were not detected and cleaned up before reuse
+4. 
**Silent failures**: In conditional checks such as `if ssh_exec ...`, a dropped connection was indistinguishable from a failing remote command, so at step 5 a connection error was treated as "Git repository not initialized"
+
+## Root Cause Analysis
+
+The SSH ControlMaster feature creates a persistent connection to avoid multiple SSH handshakes. However:
+
+- Network interruptions can close the master connection
+- The ControlMaster socket file can become stale if the connection dies
+- The script did not detect or handle these cases, leading to cascading failures
+
+## Corrections Applied
+
+### 1. Enhanced SSH Connection Management
+
+**File**: `deploy.sh`
+
+**Changes**:
+- Added `cleanup_dead_ssh()` function to properly clean up dead SSH connections
+- Added `check_ssh_connection()` function to validate the ControlMaster connection before use
+- Enhanced `ssh_exec()` function with:
+  - Connection validation before each command
+  - Automatic cleanup of dead connections
+  - Retry mechanism (up to 3 attempts)
+  - Additional SSH options for better connection stability:
+    - `ConnectTimeout=10`: Give up after 10 seconds if the connection cannot be established
+    - `ServerAliveInterval=60`: Send a keepalive probe after 60 seconds without data from the server
+    - `ServerAliveCountMax=3`: Drop the connection after 3 unanswered probes, i.e. about 3 minutes after the server stops responding
+
+### 2. 
Improved Error Handling at Step 5
+
+**File**: `deploy.sh`
+
+**Changes**:
+- Enhanced Git repository verification to properly handle SSH connection errors
+- Added explicit detection of connection errors, with a recovery path
+- Added automatic retry after connection cleanup
+- Better error messages to distinguish between connection errors and Git initialization needs
+
+## Modifications
+
+### Files Modified
+
+- `deploy.sh`:
+  - Enhanced `ssh_exec()` function with retry logic and connection validation
+  - Added `cleanup_dead_ssh()` and `check_ssh_connection()` helper functions
+  - Improved error handling in the Git repository verification step
+
+### Code Changes
+
+**Before**:
+```bash
+ssh_exec() {
+    ssh -o ControlMaster=auto -o ControlPath="${SSH_CONTROL_PATH}" -o ControlPersist=300 ${SERVER} "$@"
+}
+```
+
+**After** (abridged outline; see `deploy.sh` for the full implementation):
+```bash
+ssh_exec() {
+    # Validates connection, cleans up dead connections, retries on failure
+    # Includes connection stability options
+}
+```
+
+## Deployment Procedures
+
+### Automatic Deployment
+
+The fix is automatically applied when using the deployment script:
+
+```bash
+./deploy.sh "commit message"
+```
+
+No manual intervention is required. The script now handles SSH connection errors automatically.
+
+### Verification
+
+After deployment, verify that SSH connections are stable:
+
+1. Check that the deployment completes without SSH errors
+2. Monitor for connection errors in subsequent deployments
+3. Verify that the retry mechanism works correctly
+
+## Analysis Procedures
+
+### Monitoring SSH Connection Issues
+
+1. **Check deployment logs** for SSH connection errors:
+   ```bash
+   # Review recent deployment output
+   ```
+
+2. **Verify the SSH ControlMaster socket** (it exists only while `deploy.sh` is running; the directory is removed when the script exits):
+   ```bash
+   # On the deployment machine
+   ls -la /tmp/ssh_control_*/
+   ```
+
+3. **Test the master connection manually** (a glob inside a quoted `ControlPath` is not expanded, so resolve the socket path first):
+   ```bash
+   for sock in /tmp/ssh_control_*/debian_92.243.27.35_22; do ssh -O check -o ControlPath="$sock" debian@92.243.27.35; done
+   ```
+
+### Debugging Steps
+
+If SSH connection errors persist:
+
+1. 
Check network connectivity to the server
+2. Verify that the server's `sshd_config` does not restrict multiplexed sessions (`MaxSessions`, `MaxStartups`); ControlMaster itself is a client-side option
+3. Check for firewalls or NAT devices that drop idle or long-lived connections
+4. Review SSH server logs on the remote machine
+5. Verify SSH key authentication is working
+
+### Logs to Review
+
+- Deployment script output (stdout/stderr)
+- SSH client logs (run `ssh -v` to enable verbose output)
+- Remote SSH server logs: `/var/log/auth.log` or similar
+
+## Prevention
+
+### Best Practices
+
+1. **Connection validation**: Always validate SSH connections before use
+2. **Retry logic**: Implement retry mechanisms for network operations
+3. **Cleanup**: Properly clean up stale connections
+4. **Error handling**: Distinguish connection failures from remote command failures
+5. **Monitoring**: Monitor connection stability over time
+
+### Future Improvements
+
+- Add connection health metrics
+- Implement exponential backoff for retries
+- Add connection pooling if needed
+- Consider alternative connection methods if ControlMaster proves unreliable
+
+## Related Issues
+
+None identified at this time.
+
+## References
+
+- SSH ControlMaster documentation: `ssh_config(5)` (`ControlMaster`, `ControlPath`, `ControlPersist`)
+- Deployment script: `deploy.sh`
+- Related documentation: `docs/deployment.md`
+