🛠️ AWS CLI Mastery
December 18, 2025
Error Handling en Scripts AWS: Patrones y Mejores Prácticas
📋 Contexto
Un script de AWS sin un manejo de errores adecuado es una bomba de tiempo. He visto scripts que borran recursos de producción, dejan recursos huérfanos o fallan silenciosamente y causan incidentes horas después. Este post cubre los patrones de manejo de errores que uso en todos mis scripts de AWS.
🎯 Principios Fundamentales
1. Fail Fast con set -e
#!/bin/bash
# Fail-fast demo: with strict mode on, the first failing command ends the script.
set -o errexit  # same as `set -e`: exit on error
set -o nounset  # same as `set -u`: exit on undefined variable
set -o pipefail # a pipeline fails if any of its stages fails
# From this point on, any failing command stops the script.
aws ec2 describe-instances --instance-ids i-invalid # the script stops here
echo "Esto nunca se ejecuta"
2. Validar Prerequisitos
#!/bin/bash
# validate-prereqs.sh
# Verifies that the AWS CLI is installed, the caller is authenticated, the
# required environment variables are set and EC2 read access is granted.
# Exits with status 1 at the first prerequisite that is not met.

# Check AWS CLI installed
if ! command -v aws &> /dev/null; then
  echo "ERROR: AWS CLI not installed"
  exit 1
fi

# Check authentication
if ! aws sts get-caller-identity &> /dev/null; then
  echo "ERROR: Not authenticated with AWS"
  echo "Run: aws sso login --profile YOUR_PROFILE"
  exit 1
fi

# Check required environment variables (":?" aborts with the given message)
: "${AWS_REGION:?AWS_REGION not set}"
: "${ENVIRONMENT:?ENVIRONMENT not set}"

# Check required permissions.
# A --dry-run request never exits 0: EC2 answers with the error code
# DryRunOperation when the caller IS allowed and UnauthorizedOperation when it
# is not. The exit status is therefore useless here; inspect the error text.
dry_run_output=$(aws ec2 describe-instances --dry-run 2>&1) || true
if [[ "$dry_run_output" != *DryRunOperation* ]]; then
  echo "ERROR: Missing EC2 permissions"
  exit 1
fi

echo "✓ All prerequisites met"
3. Capturar y Loggear Errores
#!/bin/bash
# error-logging.sh
# Daily log file named after the running script (its ".sh" suffix removed).
# "$0" is quoted so a script path containing spaces or glob characters is
# passed to basename as a single argument.
LOG_FILE="/var/log/aws-scripts/$(basename "$0" .sh)-$(date +%Y%m%d).log"
# Make sure the log directory exists before anything appends to the file
mkdir -p "$(dirname "$LOG_FILE")"
# Logging functions
# Print an INFO line prefixed with the current timestamp to stdout and append
# the same line to $LOG_FILE.
# Arguments: the message words (joined into one line).
log() {
  local stamp
  stamp=$(date +'%Y-%m-%d %H:%M:%S')
  echo "[${stamp}] [INFO] $*" | tee -a "$LOG_FILE"
}
# Print an ERROR line prefixed with the current timestamp to stderr and append
# the same line to $LOG_FILE.
# Arguments: the message words (joined into one line).
error() {
  local stamp
  stamp=$(date +'%Y-%m-%d %H:%M:%S')
  # tee writes the file copy; its stdout is redirected to stderr
  echo "[${stamp}] [ERROR] $*" | tee -a "$LOG_FILE" >&2
}
# Error handler
# ERR-trap handler: report where and why the script failed, run the cleanup
# hook if the script defines one, then exit with the failing status.
# Arguments:
#   $1 - line number at which the failure happened
#   $2 - exit code of the failing command
# Exits: always, with status $2.
handle_error() {
  local line_no=$1
  local exit_code=$2
  error "Script failed at line $line_no with exit code $exit_code"
  error "Command: ${BASH_COMMAND}"
  # Cleanup is an optional hook: call it only when a cleanup_on_error function
  # exists, so a script without one does not emit "command not found" here.
  if declare -F cleanup_on_error > /dev/null; then
    cleanup_on_error
  fi
  exit "$exit_code"
}
# Route every otherwise-unhandled command failure through handle_error
trap 'handle_error ${LINENO} $?' ERR
# Usage
log "Starting deployment..."
# A bare `cmd || error "..."` only logs the problem: the failure is swallowed,
# the ERR trap is skipped for `||` lists, and the script carries on as if the
# instance had started. Stop explicitly with a non-zero status instead.
if ! aws ec2 start-instances --instance-ids i-xxx; then
  error "Failed to start instance"
  exit 1
fi
🔄 Patrón: Rollback en Caso de Error
#!/bin/bash
# deploy-with-rollback.sh
# Strict mode: exit on error, on unset variables and on any failing pipeline stage
set -euo pipefail
# ID of the backup AMI; stays empty until create_backup assigns it, and
# rollback does nothing while it is empty
BACKUP_AMI=""
# Instance targeted by the backup, deployment and rollback commands
INSTANCE_ID="i-1234567890abcdef0"
# Create backup before changes
# Create an AMI of $INSTANCE_ID without rebooting it, store the new image ID
# in the global BACKUP_AMI and block until the image is available.
# Globals: INSTANCE_ID (read), BACKUP_AMI (written)
create_backup() {
  local image_name
  log "Creating AMI backup..."
  # Timestamped name so repeated runs do not collide
  image_name="backup-$(date +%Y%m%d-%H%M%S)"
  BACKUP_AMI=$(
    aws ec2 create-image \
      --instance-id "$INSTANCE_ID" \
      --name "$image_name" \
      --no-reboot \
      --query 'ImageId' \
      --output text
  )
  log "Backup AMI created: $BACKUP_AMI"
  # Wait for AMI to be available
  aws ec2 wait image-available --image-ids "$BACKUP_AMI"
}
# Rollback function
# ERR-trap handler for the deployment: when a backup AMI exists, stop the
# instance and wait until it is stopped. Does nothing when no backup was taken.
# Globals: BACKUP_AMI (read), INSTANCE_ID (read)
rollback() {
  # Without a backup there is nothing to roll back to
  if [ -z "$BACKUP_AMI" ]; then
    return 0
  fi

  error "Deployment failed, rolling back..."

  # Stop instance
  aws ec2 stop-instances --instance-ids "$INSTANCE_ID"
  aws ec2 wait instance-stopped --instance-ids "$INSTANCE_ID"

  # Restore from AMI would go here
  # (creating new instance from backup AMI)
  log "Rollback completed"
}
# Trap errors for rollback
# Run rollback whenever a top-level command fails
trap rollback ERR
# Main deployment
create_backup
log "Deploying new version..."
# Deployment commands here.
# send-command only queues the document and returns at once, so its exit
# status says nothing about the deployment itself. Capture the command ID and
# wait for the invocation to finish; the waiter fails (triggering the ERR trap
# and therefore rollback) if the command does not complete successfully.
COMMAND_ID=$(aws ssm send-command \
  --instance-ids "$INSTANCE_ID" \
  --document-name "Deploy" \
  --query 'Command.CommandId' \
  --output text)
aws ssm wait command-executed \
  --command-id "$COMMAND_ID" \
  --instance-id "$INSTANCE_ID"
log "✓ Deployment successful"
🛡️ Patrón: Validación Antes de Acciones Destructivas
#!/bin/bash
# safe-delete.sh - Delete with confirmations
# Terminate an EC2 instance behind several safety gates: show what will be
# deleted, demand an explicit phrase for production instances, prove the
# permission with a dry run and ask for a final confirmation.
# Arguments:
#   $1 - ID of the instance to terminate
# Input: confirmation answers are read from stdin.
# Exits: 1 when a lookup, the dry run or the termination fails, or when the
#        production confirmation is refused; 0 when the final confirmation
#        is declined.
delete_instance() {
  local instance_id=$1
  # Declared separately from the assignments below: `local x=$(cmd)` would
  # hide a failing cmd behind the (always successful) `local`.
  local instance_info env confirm dry_run_output

  # 1. Get instance details
  if ! instance_info=$(aws ec2 describe-instances \
    --instance-ids "$instance_id" \
    --query 'Reservations[0].Instances[0].[Tags[?Key==`Name`].Value|[0],InstanceType,State.Name]' \
    --output text); then
    error "Could not look up instance $instance_id"
    exit 1
  fi
  echo "Instance to delete:"
  echo " ID: $instance_id"
  echo " Info: $instance_info"
  echo ""

  # 2. Check if production
  if ! env=$(aws ec2 describe-instances \
    --instance-ids "$instance_id" \
    --query 'Reservations[0].Instances[0].Tags[?Key==`Environment`].Value|[0]' \
    --output text); then
    error "Could not read Environment tag of $instance_id"
    exit 1
  fi
  if [ "$env" = "production" ]; then
    echo "⚠️ WARNING: This is a PRODUCTION instance!"
    read -r -p "Type 'DELETE PRODUCTION' to confirm: " confirm
    if [ "$confirm" != "DELETE PRODUCTION" ]; then
      echo "Aborted"
      exit 1
    fi
  fi

  # 3. Dry-run first.
  # A dry run always exits non-zero (DryRunOperation means "would succeed"),
  # so the text is captured and matched instead of piping into grep, which
  # would report failure under `set -o pipefail`.
  dry_run_output=$(aws ec2 terminate-instances \
    --instance-ids "$instance_id" --dry-run 2>&1) || true
  if [[ "$dry_run_output" != *DryRunOperation* ]]; then
    error "Dry-run failed, check permissions"
    exit 1
  fi

  # 4. Final confirmation
  read -r -p "Proceed with deletion? (yes/no): " confirm
  if [ "$confirm" != "yes" ]; then
    echo "Aborted"
    exit 0
  fi

  # 5. Take snapshot first
  echo "Creating final snapshot..."
  # ... snapshot code ...

  # 6. Execute deletion; report success only when the call really succeeded
  if ! aws ec2 terminate-instances --instance-ids "$instance_id"; then
    error "Failed to terminate instance $instance_id"
    exit 1
  fi
  log "✓ Instance $instance_id terminated"
}
# Entry point: the first script argument is the instance ID; the ":?" expansion
# aborts with "Instance ID required" when it is missing or empty
delete_instance "${1:?Instance ID required}"
⏱️ Patrón: Timeouts y Retries
#!/bin/bash
# retry-with-timeout.sh
# Retry function with exponential backoff
# Run a command until it succeeds or the attempt budget is used up, doubling
# the pause between attempts (1s, 2s, 4s, ...).
# Arguments:
#   $1   - maximum number of attempts
#   $2.. - the command and its arguments
# Returns: 0 as soon as one attempt succeeds, 1 when every attempt failed.
retry() {
  local max_attempts=$1
  shift
  local -a command=("$@")
  local try=1
  local pause=1

  while [ "$try" -le "$max_attempts" ]; do
    log "Attempt $try/$max_attempts: ${command[*]}"
    "${command[@]}" && {
      log "✓ Command succeeded"
      return 0
    }

    # Only sleep when another attempt is still to come
    if [ "$try" -lt "$max_attempts" ]; then
      warn "Command failed, retrying in ${pause}s..."
      sleep "$pause"
      pause=$((pause * 2)) # Exponential backoff
    fi
    try=$((try + 1))
  done

  error "Command failed after $max_attempts attempts"
  return 1
}
# Usage with timeout
# Run a command under a time limit and log an error when the limit is hit.
# Arguments:
#   $1   - time limit, passed as-is to the external `timeout` utility
#   $2.. - the command and its arguments (must be an external program,
#          because it is launched by the `timeout` utility)
# Returns: the command's exit status; 124 when it timed out.
timeout() {
  local timeout=$1
  shift
  local cmd=("$@")
  local exit_code=0

  # This function shadows the `timeout` utility. `command` bypasses function
  # lookup; a plain `timeout ...` here would call this function again and
  # recurse until the shell crashes.
  command timeout "$timeout" "${cmd[@]}" || exit_code=$?

  # 124 is the status `timeout` reports when the limit expired
  if [ "$exit_code" -eq 124 ]; then
    error "Command timed out after ${timeout}s"
  fi
  return "$exit_code"
}
# Example: Retry AWS command with timeout
# retry receives 3 as its attempt limit; each attempt runs the timeout wrapper
# with 30 as the time limit for the aws call
retry 3 timeout 30 aws ec2 describe-instances --instance-ids i-xxx
📊 Patrón: Validación de Resultados
#!/bin/bash
# validate-results.sh
# Get result and validate
# Poll an instance every 10 seconds until it reaches the expected state.
# Arguments:
#   $1 - instance ID
#   $2 - state name to wait for (compared with State.Name)
#   $3 - maximum time to wait in seconds (default: 300)
# Returns: 0 when the state was reached; 1 on timeout or when the state
#          could not be queried.
get_instance_state() {
  local instance_id=$1
  local expected_state=$2
  local max_wait=${3:-300} # 5 minutes default
  local elapsed=0
  local interval=10
  # Declared apart from its assignment so the aws exit status is not masked
  local state

  log "Waiting for instance $instance_id to be $expected_state..."
  while [ "$elapsed" -lt "$max_wait" ]; do
    # Fail fast on an API error (bad ID, expired credentials, ...) instead of
    # polling an empty state until the whole timeout has elapsed.
    if ! state=$(aws ec2 describe-instances \
      --instance-ids "$instance_id" \
      --query 'Reservations[0].Instances[0].State.Name' \
      --output text); then
      error "Failed to query state of instance $instance_id"
      return 1
    fi

    if [ "$state" = "$expected_state" ]; then
      log "✓ Instance is $expected_state"
      return 0
    fi

    log "Current state: $state (waiting...)"
    sleep "$interval"
    elapsed=$((elapsed + interval))
  done

  error "Timeout: Instance did not reach $expected_state state"
  return 1
}
# Start instance and validate
# The start request is followed by a poll for the "running" state with a
# 300-second limit; if the poll returns non-zero the script logs and exits 1
aws ec2 start-instances --instance-ids i-xxx
get_instance_state i-xxx "running" 300 || {
error "Failed to start instance"
exit 1
}
🔍 Patrón: Debugging Mode
#!/bin/bash
# debug-mode.sh
# Enable debug mode with environment variable
# Debug mode is opt-in: it is enabled only when DEBUG is exactly "1"
# (an unset DEBUG is treated as "0")
if [ "${DEBUG:-0}" = "1" ]; then
set -x # Print all commands
# NOTE(review): confirm the AWS CLI actually reads AWS_DEBUG; its documented
# debug switch is the --debug command-line option
export AWS_DEBUG=1 # AWS CLI debug mode
fi
# Conditional debug logging
# Print a "[DEBUG]" line to stderr when DEBUG is "1"; otherwise do nothing.
# Arguments: the message words.
# Returns: 0 in both cases. An `if` is used on purpose: the short form
#   `[ ... ] && echo ...` returns 1 whenever debugging is off, which makes a
#   plain `debug "..."` call abort a script running under `set -e` or fire
#   its ERR trap.
debug() {
  if [ "${DEBUG:-0}" = "1" ]; then
    echo "[DEBUG] $*" >&2
  fi
}
# Usage
# The arguments are expanded before debug runs, whether or not DEBUG is set,
# so instance_id and AWS_PROFILE must be defined when `set -u` is active
debug "Processing instance: $instance_id"
debug "Current AWS_PROFILE: $AWS_PROFILE"
# Run script with debug:
# DEBUG=1 ./script.sh
💰 Patrón: Cost Guard (Prevenir Gastos Excesivos)
#!/bin/bash
# cost-guard.sh
# Estimate the monthly cost of an action and require an explicit override
# when it exceeds $1000.
# Arguments:
#   $1 - action: "create-instances" or "create-snapshots"
#   $2 - number of resources the action will create
# Input: the override answer is read from stdin.
# Returns: 0 when the estimate is within the limit or was overridden;
#          1 for an unknown action.
# Exits: 1 when the limit is exceeded and the override is refused.
check_cost_limit() {
  local action=$1
  local resource_count=$2
  local estimated_monthly confirm

  # Calculate estimated cost.
  # Bash arithmetic is integer-only (0.10 is a syntax error), so prices are
  # kept in cents and the total is converted back to whole dollars.
  case "$action" in
    "create-instances")
      local hourly_cents=10 # $0.10/hour, t3.medium
      estimated_monthly=$((resource_count * hourly_cents * 730 / 100))
      ;;
    "create-snapshots")
      local gb_cents=5 # $0.05 per GB
      estimated_monthly=$((resource_count * 100 * gb_cents / 100)) # Assume 100GB each
      ;;
    *)
      # No price model: refuse rather than compare an unset estimate
      error "Unknown action for cost estimate: $action"
      return 1
      ;;
  esac

  if [ "$estimated_monthly" -gt 1000 ]; then
    error "Estimated monthly cost: \$$estimated_monthly exceeds \$1000 limit"
    read -r -p "Override? (yes/no): " confirm
    if [ "$confirm" != "yes" ]; then
      exit 1
    fi
  fi
  log "Estimated monthly cost: \$$estimated_monthly"
}
# Usage: check the estimated cost of creating 20 instances
check_cost_limit "create-instances" 20
⚠️ Errores Comunes a Evitar:
• No usar `set -e` y continuar después de errores
• Ignorar exit codes con `|| true`
• No validar inputs del usuario
• No loggear errores para debugging posterior
• No implementar timeouts (scripts que cuelgan indefinidamente)
📝 Template Completo
#!/bin/bash
# production-ready-script.sh
# Strict mode: exit on error, on unset variables and on failing pipeline stages
set -euo pipefail
# Split words on newline and tab only, never on spaces
IFS=$'\n\t'

# Configuration.
# Values produced by a command are assigned first and marked readonly
# afterwards: `readonly VAR=$(cmd)` would hide a failing cmd from `set -e`.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
readonly LOG_DIR="/var/log/aws-scripts"
# "$0" is quoted so a script path with spaces reaches basename as one argument
LOG_FILE="$LOG_DIR/$(basename "$0" .sh)-$(date +%Y%m%d).log"
readonly LOG_FILE

# Colors (escape sequences, interpreted when the logging helpers print them)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m'
# Logging
log() { echo -e "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')]${NC} $*" | tee -a "$LOG_FILE"; }
error() { echo -e "${RED}[ERROR]${NC} $*" | tee -a "$LOG_FILE" >&2; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*" | tee -a "$LOG_FILE"; }
# Error handling
# ERR-trap handler: report the failure, run cleanup_on_error and exit.
# Arguments:
#   $1 - line number at which the failure happened
#   $2 - exit code of the failing command
# Exits: always, with status $2.
handle_error() {
  local failed_line=$1
  local status=$2
  error "Script failed at line ${failed_line} with exit code ${status}"
  cleanup_on_error
  exit "${status}"
}
# An ERR trap is not inherited by shell functions unless errtrace is on.
# All of this script's logic runs inside main() and its helpers, so without
# errtrace a failure there would end the script (via `set -e`) without ever
# reaching handle_error. Note that errtrace also extends the trap to command
# substitutions and subshells.
set -o errtrace
trap 'handle_error ${LINENO} $?' ERR
# Cleanup hook called by handle_error before the script exits.
# Currently only announces the cleanup; add resource teardown below.
cleanup_on_error() {
  local notice="Performing cleanup..."
  warn "$notice"
  # Cleanup code
}
# Prerequisite checks
# Abort unless the AWS CLI is available and the caller is authenticated.
# Exits: 1 with an error message at the first check that fails.
check_prerequisites() {
  if ! command -v aws >/dev/null; then
    error "AWS CLI not found"
    exit 1
  fi

  if ! aws sts get-caller-identity &>/dev/null; then
    error "Not authenticated"
    exit 1
  fi
}
# Main logic
# Script entry point: prepare the log directory, verify prerequisites and run
# the script's own logic.
# Arguments: the script's command-line arguments (currently unused).
main() {
  local start_msg="Starting script..."
  local done_msg="✓ Script completed successfully"

  # The log directory must exist before the first log call appends to the file
  [ -d "$LOG_DIR" ] || mkdir -p "$LOG_DIR"

  log "$start_msg"
  check_prerequisites
  # Your code here
  log "$done_msg"
}
# Run main, forwarding every command-line argument unchanged
main "$@"
✅ Checklist para Scripts Production-Ready:
• `set -euo pipefail` activado
• Validación de prerequisitos
• Logging estructurado
• Error handling con trap
• Cleanup on error
• Validación de inputs
• Dry-run para operaciones destructivas
• Timeouts y retries
• Documentación y help message
💭 Conclusión
El error handling robusto es la diferencia entre un script que "funciona en mi máquina" y uno que es confiable en producción. Estos patrones me han salvado de innumerables incidentes. Invierte tiempo en implementarlos correctamente desde el inicio.