13.4. Network Automation#
13.4.1. Common Pitfalls#
1. Not validating commands before remote execution
# Bad: Typo affects all servers
for server in "${SERVERS[@]}"; do
  ssh "$server" 'systemctl restart nigix' # Typo!
done
# Good: Validate locally first
# Show the exact command once and require an explicit "y" before it is sent
# to every server.
cmd='systemctl restart nginx'
echo "Command to execute: $cmd"
# -r keeps a typed backslash literal; -n 1 returns after a single keypress
read -r -p "Continue? (y/n) " -n 1
echo # -n 1 returns without a newline, so finish the prompt line here
[[ $REPLY =~ ^[Yy]$ ]] || exit 1
for server in "${SERVERS[@]}"; do
  ssh "$server" "$cmd"
done
2. Executing without rollback capability
# Bad: No backup before deployment
ssh server 'rm -rf /app; git clone repo'
# Good: Always backup first
# The quoted 'EOF' delimiter sends the script verbatim, so $(date +%s) is
# evaluated on the remote host rather than locally.
ssh server << 'EOF'
set -e # abort before touching /app if the backup (or any later step) fails
mkdir -p /backups
tar -czf "/backups/app-$(date +%s).tar.gz" /app
cd /app
git fetch
git checkout new-version
EOF
3. Not handling partial failures in loops
# Bad: Continues if server is down
for server in "${SERVERS[@]}"; do
  ssh "$server" 'important-task' # Fails silently if unreachable
done
# Good: Detect and handle failures
# Start from an empty list so earlier runs cannot leak entries into this one
# and the length check below is safe under `set -u`.
failed_servers=()
for server in "${SERVERS[@]}"; do
  # ConnectTimeout bounds how long an unreachable host can stall the loop
  if ! ssh -o ConnectTimeout=5 "$server" 'important-task'; then
    echo "ERROR: Task failed on $server" >&2
    failed_servers+=("$server")
  fi
done
# Report every failed host at once, then exit non-zero
[[ ${#failed_servers[@]} -eq 0 ]] || {
  echo "Failed: ${failed_servers[*]}" >&2
  exit 1
}
4. Non-idempotent automation scripts
# Bad: Fails if run twice
ssh server 'mkdir /app/data' # Fails 2nd run: directory exists
# Good: Make operations idempotent
# -p makes mkdir succeed when the directory is already present
ssh server 'mkdir -p /app/data' # Always succeeds
# Copy the default only when no file named "config" exists yet
ssh server '[[ -f config ]] || cp default.conf config'
5. Secrets in logs or error messages
# Bad: Password visible in error output
# sshpass -p takes the password as a command-line argument
sshpass -p "$PASSWORD" ssh user@server 'command'
# Shows password in process list and logs
# Good: Use SSH keys
# -i selects the private key file used to authenticate
ssh -i ~/.ssh/key user@server 'command'
13.4.2. Real-World Deployment Workflow#
#!/bin/bash
set -euo pipefail
# AUTOMATED DEPLOYMENT PIPELINE
# Coordinates multi-server deployment with validation
# Application name; used in log output and as the systemd unit to restart
readonly APP_NAME="myapp"
# Version to deploy: first script argument, or "latest" when none is given
readonly VERSION="${1:-latest}"
# Hosts the pipeline operates on, in order
readonly SERVERS=("web1" "web2")
# Backup directory path referenced by backup_current_version
readonly BACKUP_DIR="/backups/deployments"
# Servers on which a deployment failed and a rollback was triggered
declare -a ROLLED_BACK=()
# Step 1: Pre-deployment checks
#######################################
# Verify every server is reachable and has enough free space under /app.
# Globals:   SERVERS (read)
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 when all checks pass, 1 on the first failing server
#######################################
pre_deployment_checks() {
  local server available
  echo "=== Pre-deployment checks ==="
  # Check all servers are reachable
  for server in "${SERVERS[@]}"; do
    if ! ping -c 1 -W 2 "$server" > /dev/null 2>&1; then
      echo "ERROR: Cannot reach $server" >&2
      return 1
    fi
  done
  # Check disk space on all servers
  for server in "${SERVERS[@]}"; do
    # Assign separately from `local` so a failing ssh is not masked.
    # df -P keeps each filesystem on one line, so field 4 is always "available".
    if ! available=$(ssh "$server" 'df -P /app | awk "NR==2{print \$4}"'); then
      echo "ERROR: Could not read disk space on $server" >&2
      return 1
    fi
    # Never feed unvalidated remote output into an arithmetic comparison
    if [[ ! $available =~ ^[0-9]+$ ]]; then
      echo "ERROR: Unexpected disk space value from $server: '$available'" >&2
      return 1
    fi
    # 10# forces base 10 so a leading zero is not parsed as octal
    if (( 10#$available < 1000000 )); then
      echo "ERROR: Insufficient disk space on $server" >&2
      return 1
    fi
  done
  echo "✓ All checks passed"
}
# Step 2: Backup current version
#######################################
# Archive /app on every server into /backups on that same server.
# Globals:   SERVERS (read), BACKUP_DIR (read)
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 when every backup succeeded, 1 on the first failure
#######################################
backup_current_version() {
  local server
  echo "=== Backing up current version ==="
  # NOTE(review): this creates BACKUP_DIR on the machine running the script;
  # the archives themselves are written to /backups on each server.
  mkdir -p "$BACKUP_DIR"
  for server in "${SERVERS[@]}"; do
    echo "Backing up $server..."
    # Quoted delimiter: the script is sent verbatim and expanded remotely.
    # The archive is written straight to its final, uniquely named location
    # instead of copying every /tmp/backup-*.tar.gz that happens to exist.
    if ! ssh "$server" << 'EOF'
set -e
mkdir -p /backups
archive="/backups/backup-$(date +%s).tar.gz"
tar -czf "$archive" -C /app .
EOF
    then
      echo "ERROR: Backup failed on $server" >&2
      return 1
    fi
  done
}
# Step 3: Deploy new version
#######################################
# Deploy VERSION to every server; roll back any server whose deploy fails.
# Globals:   SERVERS, VERSION, APP_NAME (read); ROLLED_BACK (appended)
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 when every server deployed cleanly, 1 if any server failed
#######################################
deploy_new_version() {
  local server
  local failed=0
  echo "=== Deploying version $VERSION ==="
  for server in "${SERVERS[@]}"; do
    echo "Deploying to $server..."
    # Unquoted delimiter: $VERSION and $APP_NAME are expanded locally.
    # `set -e` makes the remote script stop at the first failing step;
    # without it only the final systemctl status would be reported.
    if ! ssh "$server" << EOF
set -e
cd /app
git fetch origin
git checkout "$VERSION"
npm install --production
npm run build
systemctl restart "$APP_NAME"
EOF
    then
      echo "ERROR: Deployment failed on $server" >&2
      ROLLED_BACK+=("$server")
      rollback_server "$server" \
        || echo "ERROR: Rollback failed on $server" >&2
      failed=1
    fi
  done
  # Propagate the failure so callers testing `! deploy_new_version` see it
  return "$failed"
}
# Step 4: Health checks
# Poll the /health endpoint on every server until all respond or the 30-second
# budget is used up.
# Globals:   SERVERS (read)
# Returns:   0 once every server answers, 1 when the budget runs out
health_checks() {
  echo "=== Running health checks ==="
  local elapsed all_up
  # One polling round every 2 seconds, for at most 30 seconds in total
  for ((elapsed = 0; elapsed < 30; elapsed += 2)); do
    all_up=true
    for server in "${SERVERS[@]}"; do
      ssh "$server" 'curl -f http://localhost:3000/health' > /dev/null 2>&1 \
        && continue
      # First unhealthy server ends this round early
      all_up=false
      break
    done
    if [[ "$all_up" == "true" ]]; then
      echo "✓ All services healthy"
      return 0
    fi
    sleep 2
  done
  echo "ERROR: Services not healthy after 30 seconds" >&2
  return 1
}
# Step 5: Rollback function
#######################################
# Restore the newest archive in /backups over /app on one server and restart
# the application service.
# Globals:   APP_NAME (read; falls back to "myapp" when unset)
# Arguments: $1 - server to roll back
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 on success, 2 on a missing argument, otherwise ssh's status
#######################################
rollback_server() {
  local server=${1:-}
  if [[ -z "$server" ]]; then
    echo "ERROR: rollback_server requires a server name" >&2
    return 2
  fi
  echo "Rolling back $server..."
  # Unquoted delimiter: only the service name is expanded locally; every
  # remote expansion is escaped with a backslash.
  # `set -e` plus the explicit check stop the script before the restart when
  # there is nothing to restore.
  ssh "$server" << EOF
set -e
cd /app
latest_backup=\$(ls -t /backups/*.tar.gz 2>/dev/null | head -1)
if [ -z "\$latest_backup" ]; then
  echo "ERROR: No backup archive found in /backups" >&2
  exit 1
fi
tar -xzf "\$latest_backup" -C /app
systemctl restart "${APP_NAME:-myapp}"
EOF
}
# Main deployment
#######################################
# Run the pipeline: checks, backup, deploy, health checks. On failure, return
# every server to its previous version.
# Globals:   APP_NAME, VERSION, SERVERS (read); ROLLED_BACK (read, appended)
# Outputs:   progress on stdout, rollback errors on stderr
# Returns:   0 on a successful deployment, 1 otherwise
#######################################
main() {
  local server
  echo "Deploying $APP_NAME version $VERSION"
  if ! pre_deployment_checks; then
    echo "Pre-deployment checks failed"
    return 1
  fi
  backup_current_version
  if ! deploy_new_version || ! health_checks; then
    echo "Deployment failed, rolling back..."
    # A failed release must not leave servers on mixed versions: roll back
    # every server, skipping those already listed in ROLLED_BACK.
    for server in "${SERVERS[@]}"; do
      # ${ROLLED_BACK[*]-} stays valid under `set -u` when the array is empty
      if [[ " ${ROLLED_BACK[*]-} " == *" $server "* ]]; then
        continue
      fi
      # Keep going if one rollback fails so the remaining servers are restored
      rollback_server "$server" \
        || echo "ERROR: Rollback failed on $server" >&2
      ROLLED_BACK+=("$server")
    done
    return 1
  fi
  echo "✓ Deployment successful"
}
# Entry point: run the deployment pipeline
main
13.4.3. Network Monitoring and Health Checks#
Automated monitoring detects issues before they impact users.
13.4.3.1. Service Health Monitoring#
#!/bin/bash
# CHECK MULTIPLE SERVICES ACROSS INFRASTRUCTURE
# Services to probe: each key is "host:port", each value is the label shown
# in the output.
declare -A SERVICES=(
[web1:80]="HTTP"
[api1:3000]="API"
[db1:5432]="PostgreSQL"
[redis1:6379]="Redis"
)
#######################################
# Test whether a TCP port accepts connections within 2 seconds.
# Arguments: $1 - host, $2 - port, $3 - label used in the output
# Outputs:   one status line on stdout
# Returns:   0 when the connection succeeds, 1 otherwise
#######################################
check_service() {
  local host=$1
  local port=$2
  local service=$3
  # TCP connection test via bash's /dev/tcp pseudo-device.
  # Host and port are passed as positional parameters, not spliced into the
  # command string, so their content can never run as shell code.
  if timeout 2 bash -c 'echo > "/dev/tcp/$1/$2"' _ "$host" "$port" 2>/dev/null; then
    echo "✓ $service ($host:$port) is accessible"
    return 0
  else
    echo "✗ $service ($host:$port) is DOWN"
    return 1
  fi
}
# Monitor all services
#######################################
# Probe every entry in SERVICES and report how many are unreachable.
# Globals:   SERVICES (read; keys are "host:port", values are labels)
# Outputs:   per-service lines from check_service, then a summary line
# Returns:   number of services down, capped at 255
#######################################
monitor_services() {
  local failed=0
  local service_spec host port service
  for service_spec in "${!SERVICES[@]}"; do
    # Split on the last colon for both parts so host and port always agree
    host=${service_spec%:*}
    port=${service_spec##*:}
    service=${SERVICES[$service_spec]}
    if ! check_service "$host" "$port" "$service"; then
      # Plain assignment: ((failed++)) yields status 1 when the old value is 0,
      # which aborts the script under `set -e`
      failed=$((failed + 1))
    fi
  done
  echo "Summary: $failed services down"
  # Exit statuses wrap modulo 256; cap so 256 failures never read as success
  if (( failed > 255 )); then
    failed=255
  fi
  return "$failed"
}
# HTTP-specific checks
# Request a URL and compare the HTTP status code with the expected one.
# Arguments: $1 - URL to request
#            $2 - expected status code (defaults to 200)
# Outputs:   one status line on stdout
# Returns:   0 when the codes match, 1 otherwise
check_http_health() {
  local url=$1
  local expected_code=${2:-200}
  local http_status
  # Body is discarded; only the status code written by -w is captured
  http_status=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$url") || true
  if [[ $http_status -eq $expected_code ]]; then
    echo "✓ $url: HTTP $http_status"
    return 0
  fi
  echo "✗ $url: HTTP $http_status (expected $expected_code)"
  return 1
}
# API endpoint validation
#######################################
# Fetch a JSON endpoint and verify each expected key resolves to a value.
# Arguments: $1 - endpoint URL
#            $2 - keys to check, separated by commas and/or whitespace
# Outputs:   one status line on stdout
# Returns:   0 when every key is present, 1 otherwise
#######################################
validate_api() {
  local endpoint=$1
  local expected_keys=$2 # comma-separated keys to check
  local response key
  local -a keys=()
  if ! response=$(curl -s "$endpoint"); then
    echo "✗ Request failed: $endpoint"
    return 1
  fi
  # Split with read rather than an unquoted expansion, so a key such as "*"
  # is never glob-expanded against the current directory. read returns
  # non-zero at end of input, hence the `|| true`.
  IFS=$', \t\n' read -r -d '' -a keys <<< "$expected_keys" || true
  # Check if response contains expected keys
  for key in "${keys[@]}"; do
    # Consecutive commas produce empty fields; skip them
    [[ -n "$key" ]] || continue
    # jq -e exits non-zero when the selected value is null or false
    if ! printf '%s\n' "$response" | jq -e ".$key" > /dev/null 2>&1; then
      echo "✗ Missing key: $key"
      return 1
    fi
  done
  echo "✓ API endpoint valid"
  return 0
}
13.4.3.2. Performance Monitoring#
#!/bin/bash
# COLLECT AND ALERT ON PERFORMANCE METRICS
#######################################
# Collect CPU, memory and root-disk usage from a server and alert by mail
# when any value exceeds its threshold.
# Arguments: $1 - server to inspect
# Outputs:   a status or alert line on stdout, errors on stderr
# Returns:   0 within thresholds, 1 when an alert was raised,
#            2 when the metrics could not be collected
#######################################
monitor_server_performance() {
  local server=$1
  local cpu_threshold=80
  local mem_threshold=85
  local disk_threshold=90
  # Runs on the remote host. The single quotes keep everything literal
  # locally; \$N becomes awk's $N once the remote shell strips the backslash.
  # NOTE(review): field 8 of top's "Cpu(s)" line is assumed to be the idle
  # percentage — confirm against the top version on the target hosts.
  local remote_script='
cpu=$(top -bn1 | grep "Cpu(s)" | awk "{print 100-\$8}")
mem=$(free | awk "NR==2{print \$3/\$2*100}")
disk=$(df / | awk "NR==2{print \$5}")
echo "$cpu $mem $disk"'
  # Get metrics remotely
  local metrics
  if ! metrics=$(ssh "$server" "$remote_script"); then
    echo "ERROR: Could not collect metrics from $server" >&2
    return 2
  fi
  # Truncate to integers; int() also drops the "%" suffix df prints.
  # Requiring exactly three fields rejects output where a metric is missing.
  local cpu mem disk
  if ! read -r cpu mem disk < <(
    awk 'NR==1 && NF==3 {print int($1), int($2), int($3)}' <<< "$metrics"
  ); then
    echo "ERROR: Unexpected metrics from $server: '$metrics'" >&2
    return 2
  fi
  # Alert if thresholds exceeded
  local alerts=""
  if (( cpu > cpu_threshold )); then alerts+="CPU: ${cpu}% "; fi
  if (( mem > mem_threshold )); then alerts+="Memory: ${mem}% "; fi
  if (( disk > disk_threshold )); then alerts+="Disk: ${disk}% "; fi
  if [[ -n "$alerts" ]]; then
    echo "ALERT on $server: $alerts"
    # Send alert
    mail -s "Alert: $server" admin@example.com << EOF
Performance alert on $server:
$alerts
EOF
    return 1
  fi
  echo "✓ $server: CPU ${cpu}%, Memory ${mem}%, Disk ${disk}%"
  return 0
}
13.4.4. Infrastructure Automation Scripts#
Automating common network tasks across multiple servers improves efficiency and consistency.
13.4.4.1. Multi-Server Management#
#!/bin/bash
set -euo pipefail
# MULTI-SERVER ORCHESTRATION
# Execute commands, collect results, handle failures
# Hosts targeted by execute_on_servers
declare -a SERVERS=("web1" "web2" "api1" "db1")
# Hosts on which the most recent execute_on_servers call failed
declare -a FAILED_SERVERS=()

#######################################
# Run a command on every server, sequentially or in parallel.
# Globals:   SERVERS (read), FAILED_SERVERS (reset, then written)
# Arguments: $1 - command to run remotely
#            $2 - "true" to run on all servers concurrently (default: false)
# Outputs:   per-server result lines and a failure summary on stdout
# Returns:   0 when every server succeeded, 1 otherwise
#######################################
execute_on_servers() {
  local command=$1
  local parallel=${2:-false}
  local server i
  local -a pids=()
  local -a hosts=()
  # Start clean so failures from an earlier call are not reported again
  FAILED_SERVERS=()
  echo "Executing on ${#SERVERS[@]} servers: $command"
  if [[ "$parallel" == "true" ]]; then
    for server in "${SERVERS[@]}"; do
      execute_safe "$server" "$command" &
      pids+=("$!")
      hosts+=("$server")
    done
    # Background jobs run in subshells, so anything they append to
    # FAILED_SERVERS is lost; rebuild the list here from exit statuses.
    for i in "${!pids[@]}"; do
      if ! wait "${pids[i]}"; then
        FAILED_SERVERS+=("${hosts[i]}")
      fi
    done
  else
    for server in "${SERVERS[@]}"; do
      # execute_safe records the failure itself; `|| true` stops `set -e`
      # from ending the loop at the first failing server
      execute_safe "$server" "$command" || true
    done
  fi
  if [[ ${#FAILED_SERVERS[@]} -gt 0 ]]; then
    echo "Failed on: ${FAILED_SERVERS[*]}"
    return 1
  fi
}

#######################################
# Run one command on one server and report the outcome.
# Globals:   FAILED_SERVERS (appended on failure)
# Arguments: $1 - server, $2 - command
# Outputs:   one result line on stdout
# Returns:   0 on success, 1 on failure
#######################################
execute_safe() {
  local server=$1
  local cmd=$2
  # Remote stderr is discarded; only the success/failure line is shown
  if ssh -o ConnectTimeout=5 "$server" "$cmd" 2>/dev/null; then
    echo "✓ $server: success"
  else
    echo "✗ $server: failed"
    FAILED_SERVERS+=("$server")
    return 1
  fi
}
# Usage
# Second argument "true" selects the backgrounded (parallel) branch
execute_on_servers 'systemctl status nginx' true
# Second argument "false" runs the servers one after another
execute_on_servers 'df -h | tail -1' false
13.4.4.2. Configuration Management#
#!/bin/bash
# Deploy configuration to servers
#######################################
# Copy an nginx configuration file to every server, validate it there and
# reload nginx.
# Globals:   SERVERS (read)
# Arguments: $1 - local config file, $2 - destination path on the servers
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 when every server was updated, 1 on the first failure (the
#            remaining servers are left untouched)
#######################################
deploy_config() {
  local config_file=$1
  local remote_path=$2
  local server quoted_path
  # Shell-quote the path once so it survives being embedded in the remote
  # script.
  printf -v quoted_path '%q' "$remote_path"
  for server in "${SERVERS[@]}"; do
    echo "Deploying to $server..."
    # Copy config
    if ! scp "$config_file" "user@$server:$remote_path"; then
      echo "ERROR: Could not copy config to $server" >&2
      return 1
    fi
    # Apply and validate.
    # Unquoted delimiter: the path is expanded locally. The remote shell has
    # no positional parameters, so a literal "$1" there would be empty.
    if ! ssh "user@$server" << EOF
# Validate syntax
if ! nginx -t -c $quoted_path 2>/dev/null; then
  echo "Config validation failed"
  exit 1
fi
systemctl reload nginx
EOF
    then
      echo "ERROR: Config deployment failed on $server" >&2
      return 1
    fi
  done
}
# Backup all server configs
#######################################
# Copy /etc/nginx from every server into a dated local backup directory.
# Globals:   SERVERS (read)
#            CONFIG_BACKUP_ROOT (read; optional, defaults to /backups)
# Outputs:   progress on stdout, errors on stderr
# Returns:   0 when every server was backed up, 1 if any copy failed
#######################################
backup_all_configs() {
  local backup_dir
  backup_dir="${CONFIG_BACKUP_ROOT:-/backups}/configs-$(date +%Y%m%d)"
  local server
  local failed=0
  mkdir -p "$backup_dir" || return 1
  for server in "${SERVERS[@]}"; do
    echo "Backing up $server config..."
    # Create the per-server directory first so the copy has an existing
    # destination; the files then land in "$backup_dir/$server/nginx".
    if ! mkdir -p "$backup_dir/$server" \
      || ! scp -r "user@$server:/etc/nginx/" "$backup_dir/$server/"; then
      echo "ERROR: Config backup failed for $server" >&2
      failed=1
    fi
  done
  return "$failed"
}
# Rolling restart (one at a time)
# Restart a service on each server in turn, stopping at the first server
# where the service is not active afterwards.
# Globals:   SERVERS (read)
# Arguments: $1 - name of the service to restart
# Returns:   0 when every server restarted cleanly, 1 on the first failure
rolling_restart() {
  local service=$1
  for server in "${SERVERS[@]}"; do
    printf '%s\n' "Restarting $service on $server..."
    ssh "user@$server" "systemctl restart $service"
    # Give the service a moment to come up before probing it
    sleep 5
    ssh "user@$server" "systemctl is-active $service > /dev/null" && continue
    printf '%s\n' "ERROR: Service failed on $server" >&2
    return 1
  done
}