Set up automated Consul snapshots with verification, implement disaster recovery procedures, and configure monitoring to build a production-grade backup strategy that protects your service discovery data.
Prerequisites
- Existing Consul cluster installation
- Root or sudo access
- Basic knowledge of systemd and cron
- ACL tokens configured in Consul
What this solves
Consul stores critical service discovery data, health check information, and key-value configuration that applications depend on. Without proper backup and disaster recovery procedures, you risk losing this data during hardware failures, corruption events, or operational mistakes. This tutorial implements automated snapshot creation, backup verification, restoration procedures, and monitoring to protect your Consul cluster data with production-grade reliability.
Step-by-step configuration
Install required packages
Install the supporting tools for backup operations and monitoring. Consul itself should already be present per the prerequisites; bc is included because the monitoring script uses it for size calculations.
sudo apt update
sudo apt install -y jq curl awscli gzip bc
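Before continuing, it's worth confirming the Consul CLI and the local agent API are reachable; the address below assumes the default 127.0.0.1:8500 used throughout this tutorial:
consul version
curl -s http://127.0.0.1:8500/v1/status/leader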
Create backup directory structure
Set up directories for storing local snapshots with proper permissions.
sudo mkdir -p /opt/consul/backups/{snapshots,logs,scripts}
sudo mkdir -p /opt/consul/backups/snapshots/{daily,weekly,monthly}
sudo chown -R consul:consul /opt/consul/backups
sudo chmod 750 /opt/consul/backups
Configure Consul ACL for backup operations
Create a dedicated ACL policy with minimal read permissions for snapshot operations. Save the following rules to /opt/consul/backups/scripts/backup-policy.hcl:
node_prefix "" {
policy = "read"
}
service_prefix "" {
policy = "read"
}
key_prefix "" {
policy = "read"
}
session_prefix "" {
policy = "read"
}
operator = "read"
Apply the backup policy
Create the policy and generate a token for backup operations. Note: Consul documents the snapshot HTTP endpoints as requiring management-level ACL permissions, so this read-only policy may not be sufficient on clusters running ACLs in default-deny mode; if snapshot requests return 403, attach a management-level policy to the backup token instead.
consul acl policy create -name "consul-backup" -description "Policy for Consul backup operations" -rules @/opt/consul/backups/scripts/backup-policy.hcl
consul acl token create -description "Backup service token" -policy-name "consul-backup" | grep SecretID | awk '{print $2}' | sudo tee /opt/consul/backups/.backup-token
sudo chown consul:consul /opt/consul/backups/.backup-token
sudo chmod 600 /opt/consul/backups/.backup-token
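If you prefer structured output over grep/awk, recent Consul releases support -format=json on the ACL commands, which pairs naturally with the jq installed earlier:
consul acl token create -description "Backup service token" -policy-name "consul-backup" -format=json | jq -r .SecretID | sudo tee /opt/consul/backups/.backup-token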
Create the main backup script
Implement a comprehensive backup script with verification and error handling. Save the following as /opt/consul/backups/scripts/consul-backup.sh:
#!/bin/bash
# Consul Backup Script with Verification
set -euo pipefail
# Configuration
CONSUL_HTTP_ADDR="http://127.0.0.1:8500"
BACKUP_DIR="/opt/consul/backups"
SNAPSHOT_DIR="${BACKUP_DIR}/snapshots"
LOG_DIR="${BACKUP_DIR}/logs"
TOKEN_FILE="${BACKUP_DIR}/.backup-token"
RETENTION_DAYS=30
S3_BUCKET="${S3_BACKUP_BUCKET:-}"
SLACK_WEBHOOK="${SLACK_WEBHOOK_URL:-}"
# Logging function
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" | tee -a "${LOG_DIR}/backup-$(date '+%Y%m%d').log"
}
# Error handling
error_exit() {
log "ERROR: $1"
send_alert "Consul backup failed: $1"
exit 1
}
# Send alert function
send_alert() {
local message="$1"
if [[ -n "${SLACK_WEBHOOK}" ]]; then
curl -X POST -H 'Content-type: application/json' \
--data '{"text":"'"${message}"'"}' \
"${SLACK_WEBHOOK}" || true
fi
logger -t consul-backup "${message}"
}
# Check Consul health
check_consul_health() {
log "Checking Consul cluster health"
local leader_status
leader_status=$(curl -s "${CONSUL_HTTP_ADDR}/v1/status/leader" || echo "")
if [[ -z "${leader_status}" ]] || [[ "${leader_status}" == "\"\"" ]]; then
error_exit "No Consul leader found - cluster may be unhealthy"
fi
local peer_count
peer_count=$(curl -s "${CONSUL_HTTP_ADDR}/v1/status/peers" | jq length)
log "Consul cluster has ${peer_count} peers, leader: ${leader_status}"
}
# Create snapshot
create_snapshot() {
local snapshot_type="$1"
local timestamp=$(date '+%Y%m%d_%H%M%S')
local snapshot_file="${SNAPSHOT_DIR}/${snapshot_type}/consul_snapshot_${timestamp}.snap"
log "Creating ${snapshot_type} snapshot: ${snapshot_file}"
# Read backup token
if [[ ! -f "${TOKEN_FILE}" ]]; then
error_exit "Backup token file not found: ${TOKEN_FILE}"
fi
local token
token=$(cat "${TOKEN_FILE}")
# Create snapshot
if ! curl -s --fail -X GET \
-H "X-Consul-Token: ${token}" \
"${CONSUL_HTTP_ADDR}/v1/snapshot" \
-o "${snapshot_file}"; then
error_exit "Failed to create snapshot"
fi
# Verify snapshot file exists and has content
if [[ ! -f "${snapshot_file}" ]] || [[ ! -s "${snapshot_file}" ]]; then
error_exit "Snapshot file is empty or doesn't exist"
fi
local file_size
file_size=$(stat -c%s "${snapshot_file}")
log "Snapshot created successfully: ${file_size} bytes"
# Compress snapshot
gzip "${snapshot_file}"
local compressed_file="${snapshot_file}.gz"
local compressed_size
compressed_size=$(stat -c%s "${compressed_file}")
log "Snapshot compressed: ${compressed_size} bytes"
echo "${compressed_file}"
}
# Verify snapshot integrity
verify_snapshot() {
local snapshot_file="$1"
log "Verifying snapshot integrity: ${snapshot_file}"
# Decompress for verification
local temp_file="/tmp/consul_verify_$(basename "${snapshot_file}" .gz)"
gunzip -c "${snapshot_file}" > "${temp_file}"
# Basic file format verification
local file_type
file_type=$(file "${temp_file}")
if [[ ! "${file_type}" =~ "data" ]]; then
rm -f "${temp_file}"
error_exit "Snapshot appears corrupted - unexpected file type: ${file_type}"
fi
rm -f "${temp_file}"
log "Snapshot verification passed"
}
# Upload to S3 (optional)
upload_to_s3() {
local snapshot_file="$1"
if [[ -z "${S3_BUCKET}" ]]; then
log "S3 upload skipped - no bucket configured"
return 0
fi
log "Uploading to S3: ${S3_BUCKET}"
local s3_key="consul-backups/$(date '+%Y/%m/%d')/$(basename "${snapshot_file}")"
if aws s3 cp "${snapshot_file}" "s3://${S3_BUCKET}/${s3_key}"; then
log "Successfully uploaded to S3: s3://${S3_BUCKET}/${s3_key}"
else
log "WARNING: Failed to upload to S3"
fi
}
# Cleanup old backups
cleanup_old_backups() {
log "Cleaning up backups older than ${RETENTION_DAYS} days"
for backup_type in daily weekly monthly; do
find "${SNAPSHOT_DIR}/${backup_type}" -name "*.snap.gz" -mtime +${RETENTION_DAYS} -delete
done
find "${LOG_DIR}" -name "*.log" -mtime +${RETENTION_DAYS} -delete
log "Cleanup completed"
}
# Main execution
main() {
local backup_type="${1:-daily}"
log "Starting Consul backup process (${backup_type})"
# Pre-flight checks
check_consul_health
# Create snapshot
local snapshot_file
snapshot_file=$(create_snapshot "${backup_type}")
# Verify snapshot
verify_snapshot "${snapshot_file}"
# Upload to S3 if configured
upload_to_s3 "${snapshot_file}"
# Cleanup old backups
cleanup_old_backups
log "Backup process completed successfully"
send_alert "Consul backup completed successfully (${backup_type})"
}
# Execute main function
main "$@"
Create the restoration script
Implement a safe restoration script with verification steps. Save the following as /opt/consul/backups/scripts/consul-restore.sh:
#!/bin/bash
# Consul Restore Script with Safety Checks
set -euo pipefail
# Configuration
CONSUL_HTTP_ADDR="http://127.0.0.1:8500"
BACKUP_DIR="/opt/consul/backups"
TOKEN_FILE="${BACKUP_DIR}/.backup-token"
# Logging function
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
}
# Error handling
error_exit() {
log "ERROR: $1"
exit 1
}
# Verify snapshot before restoration
verify_restore_snapshot() {
local snapshot_file="$1"
if [[ ! -f "${snapshot_file}" ]]; then
error_exit "Snapshot file not found: ${snapshot_file}"
fi
log "Verifying snapshot file: ${snapshot_file}"
# Check if file is compressed
if [[ "${snapshot_file}" =~ \.gz$ ]]; then
if ! gunzip -t "${snapshot_file}"; then
error_exit "Compressed snapshot file is corrupted"
fi
log "Compressed snapshot verification passed"
fi
}
# Create pre-restore snapshot
create_pre_restore_backup() {
log "Creating pre-restore backup for safety"
local timestamp=$(date '+%Y%m%d_%H%M%S')
local backup_file="${BACKUP_DIR}/snapshots/pre_restore_${timestamp}.snap"
local token
token=$(cat "${TOKEN_FILE}")
if curl -s --fail -X GET \
-H "X-Consul-Token: ${token}" \
"${CONSUL_HTTP_ADDR}/v1/snapshot" \
-o "${backup_file}"; then
gzip "${backup_file}"
log "Pre-restore backup created: ${backup_file}.gz"
else
log "WARNING: Could not create pre-restore backup"
fi
}
# Perform restoration
restore_snapshot() {
local snapshot_file="$1"
local temp_file="/tmp/consul_restore_$(date '+%s').snap"
# Decompress if needed
if [[ "${snapshot_file}" =~ \.gz$ ]]; then
log "Decompressing snapshot for restoration"
gunzip -c "${snapshot_file}" > "${temp_file}"
else
cp "${snapshot_file}" "${temp_file}"
fi
log "Starting Consul snapshot restoration"
local token
token=$(cat "${TOKEN_FILE}")
if curl -s --fail -X PUT \
-H "X-Consul-Token: ${token}" \
--data-binary @"${temp_file}" \
"${CONSUL_HTTP_ADDR}/v1/snapshot"; then
log "Snapshot restoration completed"
else
rm -f "${temp_file}"
error_exit "Snapshot restoration failed"
fi
rm -f "${temp_file}"
}
# Verify cluster health after restoration
verify_post_restore() {
log "Verifying cluster health after restoration"
sleep 5
local leader_status
leader_status=$(curl -s "${CONSUL_HTTP_ADDR}/v1/status/leader" || echo "")
if [[ -z "${leader_status}" ]] || [[ "${leader_status}" == "\"\"" ]]; then
error_exit "No leader found after restoration - cluster may be unhealthy"
fi
log "Cluster health verification passed - Leader: ${leader_status}"
}
# List available snapshots
list_snapshots() {
log "Available snapshots:"
find "${BACKUP_DIR}/snapshots" -name "*.snap.gz" -type f -exec ls -lh {} \; | sort -k9
}
# Main execution
main() {
if [[ $# -eq 0 ]]; then
echo "Usage: $0 | list"
echo "Examples:"
echo " $0 list"
echo " $0 /opt/consul/backups/snapshots/daily/consul_snapshot_20241201_120000.snap.gz"
exit 1
fi
if [[ "$1" == "list" ]]; then
list_snapshots
exit 0
fi
local snapshot_file="$1"
# Safety checks
verify_restore_snapshot "${snapshot_file}"
# Create safety backup
create_pre_restore_backup
# Confirm restoration
echo "WARNING: This will restore Consul data from snapshot: ${snapshot_file}"
echo "All current data will be replaced. Continue? (yes/no)"
read -r confirmation
if [[ "${confirmation}" != "yes" ]]; then
log "Restoration cancelled by user"
exit 0
fi
# Perform restoration
restore_snapshot "${snapshot_file}"
# Verify results
verify_post_restore
log "Restoration completed successfully"
}
# Execute main function
main "$@"
Make scripts executable and set permissions
Configure proper ownership and permissions for the backup scripts.
sudo chmod +x /opt/consul/backups/scripts/consul-backup.sh
sudo chmod +x /opt/consul/backups/scripts/consul-restore.sh
sudo chown consul:consul /opt/consul/backups/scripts/*.sh
Configure automated snapshots with systemd
Create systemd service and timer units for automated backup execution. Save the following service unit as /etc/systemd/system/consul-backup.service:
[Unit]
Description=Consul Backup Service
After=consul.service
Requires=consul.service
[Service]
Type=oneshot
User=consul
Group=consul
Environment="S3_BACKUP_BUCKET=your-consul-backups"
Environment="SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
ExecStart=/opt/consul/backups/scripts/consul-backup.sh daily
TimeoutStartSec=300
PrivateTmp=true
NoNewPrivileges=true
[Install]
WantedBy=multi-user.target
Create systemd timers for different backup frequencies
Set up the daily and weekly backup schedules (a monthly timer follows the same pattern). Save the daily timer as /etc/systemd/system/consul-backup-daily.timer. Because the timer's name differs from the service it triggers, the Unit= directive below is required; without it systemd would look for a nonexistent consul-backup-daily.service.
[Unit]
Description=Daily Consul Backup Timer
[Timer]
OnCalendar=daily
RandomizedDelaySec=3600
Persistent=true
Unit=consul-backup.service
[Install]
WantedBy=timers.target
Create weekly backup timer
Configure the weekly backup schedule for additional retention. Save as /etc/systemd/system/consul-backup-weekly.timer; it triggers the weekly service defined in the next step.
[Unit]
Description=Weekly Consul Backup Timer
[Timer]
OnCalendar=weekly
RandomizedDelaySec=7200
Persistent=true
Unit=consul-backup-weekly.service
[Install]
WantedBy=timers.target
Create weekly backup service
Define the weekly backup service, which runs the same script against the weekly snapshot tier. Save as /etc/systemd/system/consul-backup-weekly.service:
[Unit]
Description=Weekly Consul Backup Service
After=consul.service
Requires=consul.service
[Service]
Type=oneshot
User=consul
Group=consul
Environment="S3_BACKUP_BUCKET=your-consul-backups"
Environment="SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
ExecStart=/opt/consul/backups/scripts/consul-backup.sh weekly
TimeoutStartSec=300
PrivateTmp=true
NoNewPrivileges=true
Enable and start backup timers
Activate the systemd timers to begin automated backups.
sudo systemctl daemon-reload
sudo systemctl enable consul-backup-daily.timer
sudo systemctl enable consul-backup-weekly.timer
sudo systemctl start consul-backup-daily.timer
sudo systemctl start consul-backup-weekly.timer
Create backup monitoring script
Implement monitoring to verify backup health and alert on failures. Save the following as /opt/consul/backups/scripts/backup-monitor.sh:
#!/bin/bash
# Consul Backup Monitoring Script
set -euo pipefail
BACKUP_DIR="/opt/consul/backups"
SNAPSHOT_DIR="${BACKUP_DIR}/snapshots"
ALERT_THRESHOLD_HOURS=25
SLACK_WEBHOOK="${SLACK_WEBHOOK_URL:-}"
# Send alert function
send_alert() {
local message="$1"
local status="$2"
if [[ -n "${SLACK_WEBHOOK}" ]]; then
local color="good"
[[ "${status}" == "error" ]] && color="danger"
[[ "${status}" == "warning" ]] && color="warning"
curl -X POST -H 'Content-type: application/json' \
--data '{
"attachments": [{
"color": "'"${color}"'",
"title": "Consul Backup Monitor",
"text": "'"${message}"'",
"ts": '$(date +%s)'
}]
}' \
"${SLACK_WEBHOOK}" >/dev/null 2>&1 || true
fi
logger -t consul-backup-monitor "${message}"
echo "$(date '+%Y-%m-%d %H:%M:%S') - ${message}"
}
# Check backup freshness
check_backup_freshness() {
local latest_backup
latest_backup=$(find "${SNAPSHOT_DIR}/daily" -name "*.snap.gz" -type f -printf '%T@ %p\n' 2>/dev/null | sort -n | tail -1 | cut -d' ' -f2-)
if [[ -z "${latest_backup}" ]]; then
send_alert "ERROR: No daily backups found in ${SNAPSHOT_DIR}/daily" "error"
return 1
fi
local backup_age_hours
backup_age_hours=$(( ($(date +%s) - $(stat -c %Y "${latest_backup}")) / 3600 ))
if [[ ${backup_age_hours} -gt ${ALERT_THRESHOLD_HOURS} ]]; then
send_alert "WARNING: Latest backup is ${backup_age_hours} hours old (threshold: ${ALERT_THRESHOLD_HOURS}h)" "warning"
return 1
fi
send_alert "OK: Latest backup is ${backup_age_hours} hours old" "good"
return 0
}
# Check backup sizes
check_backup_sizes() {
local recent_backups
recent_backups=$(find "${SNAPSHOT_DIR}/daily" -name "*.snap.gz" -type f -mtime -7 -exec ls -l {} \; | awk '{print $5}' | sort -n)
if [[ -z "${recent_backups}" ]]; then
send_alert "WARNING: No recent backups found for size analysis" "warning"
return 1
fi
local avg_size
avg_size=$(echo "${recent_backups}" | awk '{sum+=$1} END {print sum/NR}')
local latest_size
latest_size=$(echo "${recent_backups}" | tail -1)
# Alert if latest backup is 50% smaller than average (potential corruption)
local min_expected_size
min_expected_size=$(echo "${avg_size} * 0.5" | bc -l | cut -d. -f1)
if [[ ${latest_size} -lt ${min_expected_size} ]]; then
send_alert "WARNING: Latest backup size (${latest_size} bytes) is significantly smaller than average (${avg_size} bytes)" "warning"
return 1
fi
send_alert "OK: Backup size validation passed (${latest_size} bytes)" "good"
return 0
}
# Check systemd service status
check_service_status() {
if systemctl is-enabled consul-backup-daily.timer >/dev/null 2>&1; then
if systemctl is-active consul-backup-daily.timer >/dev/null 2>&1; then
send_alert "OK: Backup timer is active and enabled" "good"
else
send_alert "ERROR: Backup timer is enabled but not active" "error"
return 1
fi
else
send_alert "ERROR: Backup timer is not enabled" "error"
return 1
fi
return 0
}
# Main execution
main() {
echo "Starting Consul backup monitoring..."
local exit_code=0
check_backup_freshness || exit_code=1
check_backup_sizes || exit_code=1
check_service_status || exit_code=1
if [[ ${exit_code} -eq 0 ]]; then
echo "All backup health checks passed"
else
echo "Some backup health checks failed"
fi
exit ${exit_code}
}
# Execute main function
main "$@"
Make monitoring script executable and schedule it
Configure the monitoring script and set up regular health checks.
sudo chmod +x /opt/consul/backups/scripts/backup-monitor.sh
sudo chown consul:consul /opt/consul/backups/scripts/backup-monitor.sh
Configure monitoring cron job
Set up automated monitoring checks using cron as explained in our cron configuration guide.
sudo -u consul crontab -e
Add the monitoring schedule:
# Check backup health every 4 hours
0 */4 * * * /opt/consul/backups/scripts/backup-monitor.sh >/dev/null 2>&1
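Alternatively, if you prefer to keep scheduling on systemd timers like the backup jobs above, a minimal sketch of an equivalent pair of units; the webhook variable is set explicitly because neither cron nor systemd inherits your shell's environment:
# /etc/systemd/system/consul-backup-monitor.service
[Unit]
Description=Consul Backup Monitor
[Service]
Type=oneshot
User=consul
Group=consul
Environment="SLACK_WEBHOOK_URL=https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
ExecStart=/opt/consul/backups/scripts/backup-monitor.sh

# /etc/systemd/system/consul-backup-monitor.timer
[Unit]
Description=Run Consul backup monitor every 4 hours
[Timer]
OnCalendar=*-*-* 00/4:00:00
Persistent=true
Unit=consul-backup-monitor.service
[Install]
WantedBy=timers.target

Enable it with: sudo systemctl enable --now consul-backup-monitor.timer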
Test the backup system
Execute a manual backup to verify the system is working correctly.
sudo -u consul /opt/consul/backups/scripts/consul-backup.sh daily
sudo systemctl status consul-backup-daily.timer
sudo journalctl -u consul-backup.service -n 20
Verify your backup setup
Check that automated snapshots are working correctly:
# Verify timer status
sudo systemctl list-timers | grep consul-backup
# Check recent backup files
ls -la /opt/consul/backups/snapshots/daily/
# Test backup integrity
sudo -u consul /opt/consul/backups/scripts/backup-monitor.sh
# List available snapshots for restoration
sudo -u consul /opt/consul/backups/scripts/consul-restore.sh list
# Check Consul cluster status
consul members
consul operator raft list-peers
Verify backup logs and monitoring:
# Check backup logs
tail -f /opt/consul/backups/logs/backup-$(date '+%Y%m%d').log
# Monitor systemd journal for backup events
journalctl -u consul-backup.service -f
# Test restoration process (use with caution)
sudo -u consul /opt/consul/backups/scripts/consul-restore.sh /path/to/snapshot.gz
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Permission denied creating snapshots | Insufficient ACL permissions | Ensure the backup token has sufficient privileges; the snapshot endpoints generally require a management-level token |
| Backup script fails with 403 | Invalid or expired ACL token | Regenerate backup token: consul acl token create -policy-name consul-backup |
| Snapshots are empty or corrupted | Consul cluster unhealthy during backup | Check cluster health: consul operator raft list-peers |
| Timer not running backups | Systemd timer not active | sudo systemctl start consul-backup-daily.timer |
| S3 upload fails | AWS credentials not configured | Configure AWS CLI: aws configure or use IAM roles |
| Restoration fails with leader election | Multiple nodes restoring simultaneously | Restore on leader node first, wait for cluster convergence |
| High disk usage in backup directory | Retention cleanup not working | Check retention settings and run a manual cleanup (see example below) |
| Monitoring alerts not sent | Webhook URL misconfigured | Test webhook: curl -X POST -H 'Content-type: application/json' -d '{"text":"test"}' webhook_url |
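For the retention row above, a one-off manual cleanup mirrors what the backup script runs internally (adjust -mtime to match your RETENTION_DAYS):
sudo -u consul find /opt/consul/backups/snapshots -name '*.snap.gz' -mtime +30 -delete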
Next steps
- Configure Consul Connect service mesh with Envoy proxy for enhanced security
- Set up monitoring for Consul cluster with Prometheus and Grafana
- Configure Consul multi-datacenter WAN federation for geographic redundancy
- Implement encryption for Consul backups using GPG (see the sketch below)
- Configure advanced Consul ACL policies for production security
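For the GPG item above, a minimal symmetric-encryption sketch as a starting point (filenames are illustrative; passphrase handling and key management are up to you):
# Encrypt a snapshot with a passphrase (produces a .gpg file alongside the original)
gpg --symmetric --cipher-algo AES256 consul_snapshot_20241201_120000.snap.gz
# Decrypt before restoring
gpg --output consul_snapshot_20241201_120000.snap.gz --decrypt consul_snapshot_20241201_120000.snap.gz.gpg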
Automated install script
Run the following script as root to automate the entire setup; it accepts an optional S3 bucket and Slack webhook URL as arguments.
#!/usr/bin/env bash
set -euo pipefail
# Consul Backup and Disaster Recovery Installation Script
# Configures automated snapshots, restoration procedures, and monitoring
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Configuration
CONSUL_USER="consul"
BACKUP_DIR="/opt/consul/backups"
S3_BUCKET="${1:-}"
SLACK_WEBHOOK="${2:-}"
# Function to print colored output
log() {
echo -e "${GREEN}[$(date '+%Y-%m-%d %H:%M:%S')] $1${NC}"
}
warn() {
echo -e "${YELLOW}[$(date '+%Y-%m-%d %H:%M:%S')] WARNING: $1${NC}"
}
error() {
echo -e "${RED}[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $1${NC}" >&2
}
usage() {
echo "Usage: $0 [S3_BUCKET] [SLACK_WEBHOOK_URL]"
echo " S3_BUCKET: Optional S3 bucket for remote backups"
echo " SLACK_WEBHOOK_URL: Optional Slack webhook for alerts"
exit 1
}
# Cleanup on error
cleanup() {
error "Installation failed. Cleaning up..."
systemctl stop consul-backup.timer 2>/dev/null || true
systemctl disable consul-backup.timer 2>/dev/null || true
rm -f /etc/systemd/system/consul-backup.{service,timer}
systemctl daemon-reload
}
trap cleanup ERR
# Check if running as root
if [[ $EUID -ne 0 ]]; then
error "This script must be run as root"
exit 1
fi
# Detect distribution and set package manager
if [ -f /etc/os-release ]; then
. /etc/os-release
case "$ID" in
ubuntu|debian)
PKG_MGR="apt"
PKG_UPDATE="apt update"
PKG_INSTALL="apt install -y"
;;
almalinux|rocky|centos|rhel|ol|fedora)
PKG_MGR="dnf"
PKG_UPDATE="dnf update -y"
PKG_INSTALL="dnf install -y"
;;
amzn)
PKG_MGR="yum"
PKG_UPDATE="yum update -y"
PKG_INSTALL="yum install -y"
;;
*)
error "Unsupported distribution: $ID"
exit 1
;;
esac
else
error "Cannot detect distribution"
exit 1
fi
echo "[1/8] Installing required packages..."
$PKG_UPDATE
$PKG_INSTALL jq curl awscli gzip
log "Required packages installed successfully"
echo "[2/8] Creating backup directory structure..."
mkdir -p $BACKUP_DIR/{snapshots,logs,scripts}
mkdir -p $BACKUP_DIR/snapshots/{daily,weekly,monthly}
# Create consul user if it doesn't exist
if ! id "$CONSUL_USER" &>/dev/null; then
useradd -r -d /opt/consul -s /bin/false $CONSUL_USER
fi
chown -R $CONSUL_USER:$CONSUL_USER $BACKUP_DIR
chmod 750 $BACKUP_DIR
log "Backup directory structure created"
echo "[3/8] Creating ACL backup policy..."
cat > $BACKUP_DIR/scripts/backup-policy.hcl << 'EOF'
node_prefix "" {
policy = "read"
}
service_prefix "" {
policy = "read"
}
key_prefix "" {
policy = "read"
}
session_prefix "" {
policy = "read"
}
operator = "read"
EOF
chmod 644 $BACKUP_DIR/scripts/backup-policy.hcl
chown $CONSUL_USER:$CONSUL_USER $BACKUP_DIR/scripts/backup-policy.hcl
log "ACL backup policy created"
echo "[4/8] Creating main backup script..."
cat > $BACKUP_DIR/scripts/consul-backup.sh << 'EOF'
#!/bin/bash
set -euo pipefail
# Configuration
CONSUL_HTTP_ADDR="http://127.0.0.1:8500"
BACKUP_DIR="/opt/consul/backups"
SNAPSHOT_DIR="${BACKUP_DIR}/snapshots"
LOG_DIR="${BACKUP_DIR}/logs"
TOKEN_FILE="${BACKUP_DIR}/.backup-token"
RETENTION_DAYS=30
S3_BUCKET="${S3_BACKUP_BUCKET:-}"
SLACK_WEBHOOK="${SLACK_WEBHOOK_URL:-}"
# Logging function
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" | tee -a "${LOG_DIR}/backup-$(date '+%Y%m%d').log"
}
# Error handling
error_exit() {
log "ERROR: $1"
send_alert "Consul backup failed: $1"
exit 1
}
# Send alert function
send_alert() {
local message="$1"
if [[ -n "${SLACK_WEBHOOK}" ]]; then
curl -X POST -H 'Content-type: application/json' \
--data '{"text":"'"${message}"'"}' \
"${SLACK_WEBHOOK}" || true
fi
logger -t consul-backup "${message}"
}
# Check Consul health
check_consul_health() {
log "Checking Consul cluster health"
local leader_status
leader_status=$(curl -s "${CONSUL_HTTP_ADDR}/v1/status/leader" || echo "")
if [[ -z "${leader_status}" ]] || [[ "${leader_status}" == "\"\"" ]]; then
error_exit "No Consul leader found - cluster may be unhealthy"
fi
local peer_count
peer_count=$(curl -s "${CONSUL_HTTP_ADDR}/v1/status/peers" | jq length)
log "Consul cluster has ${peer_count} peers, leader: ${leader_status}"
}
# Create snapshot
create_snapshot() {
local snapshot_type="$1"
local timestamp=$(date '+%Y%m%d_%H%M%S')
local snapshot_file="${SNAPSHOT_DIR}/${snapshot_type}/consul_snapshot_${timestamp}.snap"
log "Creating ${snapshot_type} snapshot: ${snapshot_file}"
if [[ ! -f "${TOKEN_FILE}" ]]; then
error_exit "Backup token file not found: ${TOKEN_FILE}"
fi
local token
token=$(cat "${TOKEN_FILE}")
if ! curl -s --fail -X GET \
-H "X-Consul-Token: ${token}" \
"${CONSUL_HTTP_ADDR}/v1/snapshot" \
-o "${snapshot_file}"; then
error_exit "Failed to create snapshot"
fi
gzip "${snapshot_file}"
snapshot_file="${snapshot_file}.gz"
if [[ ! -f "${snapshot_file}" ]]; then
error_exit "Snapshot file was not created"
fi
log "Snapshot created successfully: ${snapshot_file}"
echo "${snapshot_file}"
}
# Upload to S3
upload_to_s3() {
local snapshot_file="$1"
if [[ -n "${S3_BUCKET}" ]]; then
log "Uploading snapshot to S3: ${S3_BUCKET}"
local s3_key="consul-snapshots/$(basename "${snapshot_file}")"
if aws s3 cp "${snapshot_file}" "s3://${S3_BUCKET}/${s3_key}"; then
log "Successfully uploaded to S3: s3://${S3_BUCKET}/${s3_key}"
else
error_exit "Failed to upload snapshot to S3"
fi
fi
}
# Cleanup old snapshots
cleanup_old_snapshots() {
log "Cleaning up snapshots older than ${RETENTION_DAYS} days"
find "${SNAPSHOT_DIR}" -name "*.snap.gz" -mtime +${RETENTION_DAYS} -delete
log "Cleanup completed"
}
# Main execution
main() {
local backup_type="${1:-daily}"
log "Starting Consul backup process (${backup_type})"
check_consul_health
local snapshot_file
snapshot_file=$(create_snapshot "${backup_type}")
upload_to_s3 "${snapshot_file}"
cleanup_old_snapshots
log "Backup process completed successfully"
send_alert "Consul backup completed successfully: $(basename "${snapshot_file}")"
}
main "$@"
EOF
chmod 755 $BACKUP_DIR/scripts/consul-backup.sh
chown $CONSUL_USER:$CONSUL_USER $BACKUP_DIR/scripts/consul-backup.sh
log "Main backup script created"
echo "[5/8] Creating restore script..."
cat > $BACKUP_DIR/scripts/consul-restore.sh << 'EOF'
#!/bin/bash
set -euo pipefail
CONSUL_HTTP_ADDR="http://127.0.0.1:8500"
TOKEN_FILE="/opt/consul/backups/.backup-token"
if [[ $# -ne 1 ]]; then
echo "Usage: $0 <snapshot-file>"
exit 1
fi
SNAPSHOT_FILE="$1"
if [[ ! -f "$SNAPSHOT_FILE" ]]; then
echo "Error: Snapshot file not found: $SNAPSHOT_FILE"
exit 1
fi
echo "WARNING: This will restore Consul data and may overwrite existing data."
read -p "Are you sure you want to continue? (yes/no): " confirm
if [[ "$confirm" != "yes" ]]; then
echo "Restore cancelled"
exit 0
fi
if [[ "$SNAPSHOT_FILE" == *.gz ]]; then
temp_file=$(mktemp)
gunzip -c "$SNAPSHOT_FILE" > "$temp_file"
SNAPSHOT_FILE="$temp_file"
fi
token=$(cat "$TOKEN_FILE")
echo "Restoring from snapshot: $SNAPSHOT_FILE"
if curl -s --fail -X PUT -H "X-Consul-Token: $token" \
--data-binary "@$SNAPSHOT_FILE" \
"$CONSUL_HTTP_ADDR/v1/snapshot"; then
echo "Restore completed successfully"
else
echo "Restore failed"
exit 1
fi
if [[ -n "${temp_file:-}" ]]; then
rm -f "$temp_file"
fi
EOF
chmod 755 $BACKUP_DIR/scripts/consul-restore.sh
chown $CONSUL_USER:$CONSUL_USER $BACKUP_DIR/scripts/consul-restore.sh
log "Restore script created"
echo "[6/8] Creating systemd service and timer..."
cat > /etc/systemd/system/consul-backup.service << EOF
[Unit]
Description=Consul Backup Service
After=consul.service
[Service]
Type=oneshot
User=$CONSUL_USER
Group=$CONSUL_USER
ExecStart=$BACKUP_DIR/scripts/consul-backup.sh daily
Environment=S3_BACKUP_BUCKET=$S3_BUCKET
Environment=SLACK_WEBHOOK_URL=$SLACK_WEBHOOK
EOF
cat > /etc/systemd/system/consul-backup.timer << 'EOF'
[Unit]
Description=Run Consul backup daily
[Timer]
OnCalendar=daily
Persistent=true
RandomizedDelaySec=1800
[Install]
WantedBy=timers.target
EOF
systemctl daemon-reload
systemctl enable consul-backup.timer
systemctl start consul-backup.timer
log "Systemd service and timer configured"
echo "[7/8] Creating token placeholder..."
touch $BACKUP_DIR/.backup-token
chown $CONSUL_USER:$CONSUL_USER $BACKUP_DIR/.backup-token
chmod 600 $BACKUP_DIR/.backup-token
log "Token placeholder created"
echo "[8/8] Verifying installation..."
if systemctl is-active --quiet consul-backup.timer; then
log "Consul backup timer is active"
else
warn "Consul backup timer is not active"
fi
if [[ -d "$BACKUP_DIR" ]] && [[ -x "$BACKUP_DIR/scripts/consul-backup.sh" ]]; then
log "Backup scripts are properly installed"
else
error "Backup scripts installation verification failed"
exit 1
fi
log "Installation completed successfully!"
echo
echo "Next steps:"
echo "1. Configure Consul ACL policy: consul acl policy create -name 'consul-backup' -description 'Policy for Consul backup operations' -rules @$BACKUP_DIR/scripts/backup-policy.hcl"
echo "2. Create ACL token: consul acl token create -description 'Backup service token' -policy-name 'consul-backup'"
echo "3. Save the token SecretID to: $BACKUP_DIR/.backup-token"
echo "4. Test backup: sudo -u $CONSUL_USER $BACKUP_DIR/scripts/consul-backup.sh"
Review the script before running. Execute with: bash install.sh