Configure automated ScyllaDB backups with nodetool snapshots, implement validation scripts to verify backup integrity, and set up systemd timers for automated restore testing with Prometheus monitoring.
Prerequisites
- ScyllaDB cluster running
- Root access
- Python 3 with pip
- At least 10GB free disk space
What this solves
ScyllaDB backup validation ensures your snapshots are recoverable when disasters strike. This tutorial sets up automated backup creation with nodetool, validates backup integrity through checksums and metadata verification, and implements automated restore testing to catch corruption before you need the backups. You'll also configure Prometheus monitoring to track backup health and receive alerts when validation fails.
Step-by-step configuration
Install backup validation dependencies
Install required tools for backup validation, compression, and monitoring.
# Refresh the apt package index, then install the helper tools.
# jq and pigz are invoked by the backup/restore scripts further down.
sudo apt update
sudo apt install -y python3 python3-pip jq pigz parallel curl
Create backup directory structure
Set up organized directories for backups, validation logs, and restore testing.
# Create the backup tree (snapshots, validation, restore-test, scripts, logs),
# hand it to the scylla user and make it world-readable / owner-writable.
sudo mkdir -p /opt/scylladb-backup/{snapshots,validation,restore-test,scripts,logs}
sudo chown -R scylla:scylla /opt/scylladb-backup
sudo chmod -R 755 /opt/scylladb-backup
Configure backup automation script
Create the main backup script that handles snapshot creation, validation, and cleanup.
#!/bin/bash
# ScyllaDB Backup and Validation Script
#
# Environment overrides:
#   SCYLLA_KEYSPACES       whitespace-separated keyspaces, or "all" (default: system_schema)
#   BACKUP_RETENTION_DAYS  days to keep backups and logs (default: 7)
set -euo pipefail

# Configuration
BACKUP_DIR="/opt/scylladb-backup"
SNAPSHOT_DIR="$BACKUP_DIR/snapshots"
VALIDATION_DIR="$BACKUP_DIR/validation"
LOG_DIR="$BACKUP_DIR/logs"
KEYSPACES="${SCYLLA_KEYSPACES:-system_schema}"
RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-7}
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
SNAPSHOT_TAG="backup_$TIMESTAMP"
LOG_FILE="$LOG_DIR/backup_$TIMESTAMP.log"

# Prometheus metrics file
METRICS_FILE="/var/lib/node_exporter/textfile_collector/scylladb_backup.prom"
# Logging function
log() {
  # Prefix the message ($1) with a timestamp, print it on stdout and
  # append the same line to $LOG_FILE.
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo "$stamp $1" | tee -a "$LOG_FILE"
}
# Create snapshot
create_snapshot() {
  # Take a snapshot tagged $SNAPSHOT_TAG: of everything when KEYSPACES is
  # "all", otherwise of each keyspace listed in $KEYSPACES.
  log "Creating snapshot: $SNAPSHOT_TAG"

  # Clear any existing snapshots first. No tag or keyspace is passed here,
  # and a failure of this step is deliberately ignored.
  nodetool clearsnapshot 2>/dev/null || true

  case "$KEYSPACES" in
    all)
      nodetool snapshot -t "$SNAPSHOT_TAG"
      ;;
    *)
      # Unquoted on purpose: $KEYSPACES is a whitespace-separated list.
      for ks in $KEYSPACES; do
        nodetool snapshot -t "$SNAPSHOT_TAG" "$ks"
      done
      ;;
  esac

  log "Snapshot created successfully"
}
# Copy and compress snapshot data
copy_snapshot() {
  # Archive every snapshot directory named $SNAPSHOT_TAG found under the
  # Scylla data directory into $SNAPSHOT_DIR/$TIMESTAMP, mirroring the
  # keyspace/table layout, then write the backup metadata.
  log "Copying snapshot data to backup directory"
  local target_root="$SNAPSHOT_DIR/$TIMESTAMP"
  mkdir -p "$target_root"

  find /var/lib/scylla/data -name "$SNAPSHOT_TAG" -type d | while read -r snap_dir; do
    # Path of the snapshot relative to the data directory.
    local relative=${snap_dir#/var/lib/scylla/data/}
    local archive_base="$target_root/$relative"
    mkdir -p "$(dirname "$archive_base")"

    # One pigz-compressed tarball per snapshot directory.
    log "Copying $snap_dir to $archive_base"
    tar -I pigz -cf "${archive_base}.tar.gz" -C "$(dirname "$snap_dir")" "$(basename "$snap_dir")"
  done

  create_metadata "$target_root"
  log "Snapshot copy completed"
}
# Create backup metadata
create_metadata() {
  # Write checksums.sha256 and metadata.json for the backup in $1.
  #
  # Arguments: $1 - backup directory containing the *.tar.gz archives
  # Globals:   TIMESTAMP, SNAPSHOT_TAG, KEYSPACES (read)
  # Outputs:   $1/checksums.sha256 and $1/metadata.json
  local backup_dir="$1"
  local metadata_file="$backup_dir/metadata.json"
  local checksums_file="$backup_dir/checksums.sha256"
  local cluster_output info_output cluster_name node_id datacenter
  log "Creating backup metadata"

  # Cluster information is descriptive only, so a nodetool failure leaves the
  # fields empty instead of aborting the backup.
  cluster_output=$(nodetool describecluster 2>/dev/null) || cluster_output=""
  info_output=$(nodetool info 2>/dev/null) || info_output=""

  # nodetool prints "Key   : value" lines. Take everything after the first
  # colon so multi-word values (e.g. "Test Cluster") survive and the ":"
  # separator is never mistaken for the value.
  cluster_name=$(awk '/^[[:space:]]*Name:/ && !seen { sub(/^[^:]*:[[:space:]]*/, ""); print; seen = 1 }' <<< "$cluster_output")
  node_id=$(awk '/^ID[[:space:]]*:/ && !seen { sub(/^[^:]*:[[:space:]]*/, ""); print; seen = 1 }' <<< "$info_output")
  datacenter=$(awk '/^Data Center[[:space:]]*:/ && !seen { sub(/^[^:]*:[[:space:]]*/, ""); print; seen = 1 }' <<< "$info_output")

  # Calculate checksums (paths are recorded exactly as find prints them)
  find "$backup_dir" -name "*.tar.gz" -exec sha256sum {} + > "$checksums_file"

  # Create JSON metadata
  cat > "$metadata_file" << EOF
{
"timestamp": "$TIMESTAMP",
"snapshot_tag": "$SNAPSHOT_TAG",
"cluster_name": "$cluster_name",
"node_id": "$node_id",
"datacenter": "$datacenter",
"keyspaces": "$KEYSPACES",
"backup_size_bytes": $(du -sb "$backup_dir" | awk '{print $1}'),
"file_count": $(find "$backup_dir" -name "*.tar.gz" | wc -l),
"checksums_file": "checksums.sha256",
"node_hostname": "$(hostname)",
"scylla_version": "$(scylla --version | head -1)"
}
EOF
  log "Metadata created: $metadata_file"
}
# Validate backup integrity
validate_backup() {
  # Validate the backup in $SNAPSHOT_DIR/$TIMESTAMP: metadata present,
  # checksums verify, and a sample of up to three archives can be listed.
  #
  # Globals: SNAPSHOT_DIR, VALIDATION_DIR, TIMESTAMP (read)
  # Outputs: PASS/FAIL lines appended to the validation log; the result is
  #          passed on to update_prometheus_metrics
  # Exits:   the script with status 1 when any check fails
  local backup_dir="$SNAPSHOT_DIR/$TIMESTAMP"
  local validation_log="$VALIDATION_DIR/validation_$TIMESTAMP.log"
  local validation_status=0
  local archive
  log "Validating backup integrity"

  # Check metadata exists
  if [ ! -f "$backup_dir/metadata.json" ]; then
    echo "FAIL: metadata.json missing" >> "$validation_log"
    validation_status=1
  else
    echo "PASS: metadata.json exists" >> "$validation_log"
  fi

  # Verify checksums. The cd happens in a subshell so the caller's working
  # directory is left untouched.
  if [ -f "$backup_dir/checksums.sha256" ]; then
    if (cd "$backup_dir" && sha256sum -c checksums.sha256) >> "$validation_log" 2>&1; then
      echo "PASS: All checksums verified" >> "$validation_log"
    else
      echo "FAIL: Checksum verification failed" >> "$validation_log"
      validation_status=1
    fi
  else
    echo "FAIL: checksums.sha256 missing" >> "$validation_log"
    validation_status=1
  fi

  # Test archive readability. The loop reads from process substitution, not a
  # pipeline, so it runs in this shell and a failure really updates
  # validation_status.
  while IFS= read -r archive; do
    if tar -tzf "$archive" > /dev/null 2>&1; then
      echo "PASS: Archive readable - $(basename "$archive")" >> "$validation_log"
    else
      echo "FAIL: Archive corrupted - $(basename "$archive")" >> "$validation_log"
      validation_status=1
    fi
  done < <(find "$backup_dir" -name "*.tar.gz" | head -3)

  # Update metrics
  update_prometheus_metrics "$validation_status"

  if [ "$validation_status" -eq 0 ]; then
    log "Backup validation PASSED"
  else
    log "Backup validation FAILED - check $validation_log"
    exit 1
  fi
}
# Update Prometheus metrics
update_prometheus_metrics() {
  # Publish the result of the current backup run as a node_exporter
  # textfile-collector file.
  #
  # Arguments: $1 - validation status (0 = success, non-zero = failure)
  # Globals:   SNAPSHOT_DIR, TIMESTAMP, METRICS_FILE (read)
  # Outputs:   rewrites $METRICS_FILE
  local validation_status="$1"
  local backup_dir="$SNAPSHOT_DIR/$TIMESTAMP"
  local backup_size=0
  local file_count=0
  local last_success=0
  local tmp_file="${METRICS_FILE}.$$"

  if [ -f "$backup_dir/metadata.json" ]; then
    backup_size=$(jq -r '.backup_size_bytes' "$backup_dir/metadata.json")
    file_count=$(jq -r '.file_count' "$backup_dir/metadata.json")
  fi

  # Only a successful run may advance the "last success" timestamp. On
  # failure carry the previously published value forward (0 if there is none).
  if [ "$validation_status" -eq 0 ]; then
    last_success=$(date +%s)
  elif [ -f "$METRICS_FILE" ]; then
    last_success=$(awk '$1 == "scylladb_backup_last_success_timestamp" { print $2 }' "$METRICS_FILE")
    last_success=${last_success:-0}
  fi

  # Write to a temporary name and rename it into place so a scrape never
  # sees a half-written file.
  cat > "$tmp_file" << EOF
# HELP scylladb_backup_last_success_timestamp Last successful backup timestamp
# TYPE scylladb_backup_last_success_timestamp gauge
scylladb_backup_last_success_timestamp $last_success
# HELP scylladb_backup_size_bytes Size of last backup in bytes
# TYPE scylladb_backup_size_bytes gauge
scylladb_backup_size_bytes $backup_size
# HELP scylladb_backup_file_count Number of files in last backup
# TYPE scylladb_backup_file_count gauge
scylladb_backup_file_count $file_count
# HELP scylladb_backup_validation_status Last backup validation status (0=success, 1=failure)
# TYPE scylladb_backup_validation_status gauge
scylladb_backup_validation_status $validation_status
EOF
  mv -f "$tmp_file" "$METRICS_FILE"
}
# Clean old backups
cleanup_old_backups() {
  # Delete backups, validation logs and run logs older than $RETENTION_DAYS.
  #
  # Globals: SNAPSHOT_DIR, VALIDATION_DIR, LOG_DIR, RETENTION_DAYS (read)
  # Every step is best-effort; errors are ignored.
  log "Cleaning up backups older than $RETENTION_DAYS days"

  # Only the top-level per-run directories are candidates. Without the depth
  # limits, find would also match old directories nested inside a fresh
  # backup whose name starts with a digit.
  find "$SNAPSHOT_DIR" -mindepth 1 -maxdepth 1 -type d -name "[0-9]*" \
    -mtime "+$RETENTION_DAYS" -exec rm -rf {} + 2>/dev/null || true
  find "$VALIDATION_DIR" -name "*.log" -mtime "+$RETENTION_DAYS" -delete 2>/dev/null || true
  find "$LOG_DIR" -name "*.log" -mtime "+$RETENTION_DAYS" -delete 2>/dev/null || true

  log "Cleanup completed"
}
# Main execution
main() {
  # Run one backup cycle: snapshot, archive, validate, prune old backups.
  #
  # Globals: SNAPSHOT_TAG (read)
  log "Starting ScyllaDB backup process"

  # Clear this run's snapshot from ScyllaDB on every exit path.
  # validate_backup terminates the script on failure, so a trap is the only
  # place that is reached in both cases. The tag must be passed with -t; a
  # bare argument is interpreted by nodetool as a keyspace name.
  trap 'nodetool clearsnapshot -t "$SNAPSHOT_TAG" 2>/dev/null || true' EXIT

  create_snapshot
  copy_snapshot
  validate_backup
  cleanup_old_backups
  log "Backup process completed successfully"
}
main "$@"
# Make the backup script executable and owned by the scylla user.
sudo chmod +x /opt/scylladb-backup/scripts/backup.sh
sudo chown scylla:scylla /opt/scylladb-backup/scripts/backup.sh
Create restore testing script
Build an automated restore test script that validates backup recoverability.
#!/bin/bash
# ScyllaDB Restore Testing Script
set -euo pipefail

# Configuration
BACKUP_DIR="/opt/scylladb-backup"
SNAPSHOT_DIR="$BACKUP_DIR/snapshots"
RESTORE_TEST_DIR="$BACKUP_DIR/restore-test"
LOG_DIR="$BACKUP_DIR/logs"
TEST_KEYSPACE="backup_test_ks"
TEST_TABLE="backup_test_table"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="$LOG_DIR/restore_test_$TIMESTAMP.log"

# Prometheus metrics
METRICS_FILE="/var/lib/node_exporter/textfile_collector/scylladb_restore_test.prom"
# Logging function
log() {
  # Prefix the message ($1) with a timestamp, print it on stdout and
  # append the same line to $LOG_FILE.
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  echo "$stamp $1" | tee -a "$LOG_FILE"
}
# Find latest backup
find_latest_backup() {
  # Print the path of the newest backup directory in $SNAPSHOT_DIR.
  #
  # Globals: SNAPSHOT_DIR (read)
  # Outputs: the directory path on stdout (and nothing else, because callers
  #          capture it); diagnostics go to stderr
  # Exits:   1 when no backup directory exists
  local latest_backup

  # Backup directories are timestamp-named, so the lexicographically greatest
  # top-level entry is the newest. The depth limits keep nested keyspace and
  # table directories out of the candidate list.
  latest_backup=$(find "$SNAPSHOT_DIR" -mindepth 1 -maxdepth 1 -type d -name "[0-9]*" 2>/dev/null \
    | LC_ALL=C sort | tail -n 1) || latest_backup=""

  if [ -z "$latest_backup" ]; then
    log "ERROR: No backups found in $SNAPSHOT_DIR" >&2
    exit 1
  fi
  echo "$latest_backup"
}
# Create test keyspace and data
create_test_data() {
  # (Re)create the test keyspace and table and insert three sample rows.
  #
  # Globals: TEST_KEYSPACE, TEST_TABLE (read)
  log "Creating test keyspace and data"

  # Drop existing test keyspace if exists
  cqlsh -e "DROP KEYSPACE IF EXISTS $TEST_KEYSPACE;" 2>/dev/null || true

  # Create test keyspace
  cqlsh -e "CREATE KEYSPACE $TEST_KEYSPACE WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};"

  # Create test table with sample data. Blob literals need an even number of
  # hex digits (whole bytes), so each value below is written as full bytes.
  cqlsh -k "$TEST_KEYSPACE" -e "
CREATE TABLE $TEST_TABLE (
id UUID PRIMARY KEY,
name TEXT,
created_at TIMESTAMP,
data BLOB
);
INSERT INTO $TEST_TABLE (id, name, created_at, data)
VALUES (uuid(), 'test_record_1', toTimestamp(now()), 0x0123456789abcdef);
INSERT INTO $TEST_TABLE (id, name, created_at, data)
VALUES (uuid(), 'test_record_2', toTimestamp(now()), 0xfedcba9876543210);
INSERT INTO $TEST_TABLE (id, name, created_at, data)
VALUES (uuid(), 'test_record_3', toTimestamp(now()), 0x1a2b3c4d5e6f);
"
  log "Test data created successfully"
}
# Get test data checksum
get_test_data_checksum() {
  # Print the MD5 digest of cqlsh's "SELECT *" output for the test table.
  local digest
  # A failing query must not abort here; whatever digest was produced is
  # still printed.
  digest=$(cqlsh -k "$TEST_KEYSPACE" -e "SELECT * FROM $TEST_TABLE;" | md5sum | awk '{print $1}') || true
  echo "$digest"
}
# Perform snapshot of test data
snapshot_test_data() {
  # Snapshot the test keyspace and print the snapshot tag.
  #
  # Globals: TIMESTAMP, TEST_KEYSPACE (read)
  # Outputs: only the tag on stdout, because callers capture it; the log line
  #          and nodetool's own output are sent to stderr
  local snapshot_tag="restore_test_$TIMESTAMP"
  log "Creating snapshot of test data: $snapshot_tag" >&2
  nodetool snapshot -t "$snapshot_tag" "$TEST_KEYSPACE" >&2
  echo "$snapshot_tag"
}
# Extract backup for testing
extract_backup() {
  # Unpack every archive of a backup into a scratch directory.
  #
  # Arguments: $1 - backup directory holding the *.tar.gz archives
  # Globals:   RESTORE_TEST_DIR, TIMESTAMP (read)
  # Outputs:   only the extraction directory on stdout, because callers
  #            capture it; diagnostics go to stderr
  # Returns:   1 when an archive cannot be extracted
  local backup_dir="$1"
  local extract_dir="$RESTORE_TEST_DIR/extract_$TIMESTAMP"
  local archive rel_path dest_dir

  log "Extracting backup to $extract_dir" >&2
  mkdir -p "$extract_dir"

  # Extract all archives from the backup
  while IFS= read -r archive; do
    # Mirror the archive's position inside the backup tree. Parameter
    # expansion is used so the path is never interpreted as a sed pattern.
    rel_path=${archive#"$backup_dir"/}
    rel_path=${rel_path%.tar.gz}
    dest_dir="$extract_dir/$rel_path"
    mkdir -p "$(dirname "$dest_dir")"
    if ! tar -I pigz -xf "$archive" -C "$(dirname "$dest_dir")" >&2; then
      log "ERROR: Failed to extract $archive" >&2
      return 1
    fi
  done < <(find "$backup_dir" -name "*.tar.gz")

  echo "$extract_dir"
}
# Restore keyspace from backup
restore_from_backup() {
  # Copy the test keyspace's SSTables from an extracted backup into a
  # "<keyspace>_restored" data directory and restart ScyllaDB.
  #
  # Arguments: $1 - directory produced by extract_backup
  # Globals:   TEST_KEYSPACE (read)
  # Returns:   0 on success, 1 when no snapshot data is found, the keyspace
  #            cannot be created, or ScyllaDB does not come back
  #
  # NOTE(review): this stops and restarts scylla-server through sudo, so the
  # calling user needs passwordless sudo for systemctl/mkdir/cp/chown - confirm.
  # NOTE(review): the function only finds data if the extracted backup really
  # contains $TEST_KEYSPACE under a "backup_*" snapshot tag - verify that the
  # backup job includes that keyspace.
  local extract_dir="$1"
  local target_keyspace="${TEST_KEYSPACE}_restored"
  local target_data_dir="/var/lib/scylla/data/$target_keyspace"
  local snapshot_path db_file source_dir table_dir
  local retries=30

  log "Restoring keyspace as $target_keyspace"

  # Create target keyspace
  cqlsh -e "DROP KEYSPACE IF EXISTS $target_keyspace;" 2>/dev/null || true
  if ! cqlsh -e "CREATE KEYSPACE $target_keyspace WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};"; then
    log "ERROR: Could not create keyspace $target_keyspace"
    return 1
  fi

  # Find snapshot data for our test keyspace. -path needs wildcards on both
  # sides; a bare "/keyspace/" pattern never matches a full path.
  snapshot_path=$(find "$extract_dir" -type d -path "*/$TEST_KEYSPACE/*" -name "backup_*" | head -n 1) || true
  if [ -z "$snapshot_path" ]; then
    log "ERROR: No snapshot data found for $TEST_KEYSPACE"
    return 1
  fi

  # Copy snapshot files to target keyspace data directory
  sudo systemctl stop scylla-server
  while IFS= read -r db_file; do
    source_dir=$(dirname "$db_file")
    # .../<keyspace>/<table>/snapshots/<tag>  ->  <table>
    table_dir=${source_dir##*/"$TEST_KEYSPACE"/}
    table_dir=${table_dir%%/snapshots/*}
    table_dir="$target_data_dir/$table_dir"
    sudo mkdir -p "$table_dir"
    sudo cp "$db_file" "$table_dir/"
    sudo chown scylla:scylla "$table_dir"/*.db
  done < <(find "$snapshot_path" -name "*.db")
  sudo systemctl start scylla-server

  # Wait for ScyllaDB to start. The status probe is the loop condition, so a
  # node that answers on the final attempt still counts as started.
  until nodetool status >/dev/null 2>&1; do
    if [ "$retries" -le 0 ]; then
      log "ERROR: ScyllaDB failed to start after restore"
      return 1
    fi
    log "Waiting for ScyllaDB to start... ($retries retries left)"
    sleep 10
    retries=$((retries - 1))
  done

  log "Restore completed for keyspace $target_keyspace"
}
# Verify restored data
verify_restored_data() {
  # Compare the MD5 of the restored table's "SELECT *" output with the
  # checksum passed as $1. Returns 0 on a match, 1 otherwise.
  local restored_ks="${TEST_KEYSPACE}_restored"
  local expected="$1"
  local actual

  log "Verifying restored data integrity"

  # Get restored data checksum (a failing query still yields a digest to compare)
  actual=$(cqlsh -k "$restored_ks" -e "SELECT * FROM $TEST_TABLE;" | md5sum | awk '{print $1}') || true

  if [ "$expected" != "$actual" ]; then
    log "ERROR: Restored data does not match original"
    log "Original checksum: $expected"
    log "Restored checksum: $actual"
    return 1
  fi

  log "SUCCESS: Restored data matches original (checksum: $actual)"
  return 0
}
# Update Prometheus metrics
update_restore_metrics() {
  # Publish the restore-test result as a node_exporter textfile-collector file.
  #
  # Arguments: $1 - test status (0 = success, 1 = failure)
  #            $2 - test duration in seconds
  # Globals:   METRICS_FILE (read)
  local test_status="$1"
  local test_duration="$2"
  local tmp_file="${METRICS_FILE}.$$"

  # HELP/TYPE lines must start with "# ". The file is written under a
  # temporary name and renamed so a scrape never sees a partial file.
  cat > "$tmp_file" << EOF
# HELP scylladb_restore_test_last_run_timestamp Last restore test timestamp
# TYPE scylladb_restore_test_last_run_timestamp gauge
scylladb_restore_test_last_run_timestamp $(date +%s)
# HELP scylladb_restore_test_status Last restore test status (0=success, 1=failure)
# TYPE scylladb_restore_test_status gauge
scylladb_restore_test_status $test_status
# HELP scylladb_restore_test_duration_seconds Duration of last restore test
# TYPE scylladb_restore_test_duration_seconds gauge
scylladb_restore_test_duration_seconds $test_duration
EOF
  mv -f "$tmp_file" "$METRICS_FILE"
}
# Cleanup test data
cleanup_test_data() {
  # Remove everything the restore test created: both test keyspaces, the
  # snapshots belonging to them, and the extraction directory.
  #
  # Globals: TEST_KEYSPACE, TIMESTAMP, RESTORE_TEST_DIR (read)
  # Every step is best-effort; errors are ignored.
  log "Cleaning up test data"

  # Drop test keyspaces
  cqlsh -e "DROP KEYSPACE IF EXISTS $TEST_KEYSPACE;" 2>/dev/null || true
  cqlsh -e "DROP KEYSPACE IF EXISTS ${TEST_KEYSPACE}_restored;" 2>/dev/null || true

  # Clean only the snapshots that belong to this test. A bare
  # "nodetool clearsnapshot" would remove every snapshot on the node,
  # including backups taken by other jobs.
  nodetool clearsnapshot -t "restore_test_$TIMESTAMP" 2>/dev/null || true
  nodetool clearsnapshot "$TEST_KEYSPACE" "${TEST_KEYSPACE}_restored" 2>/dev/null || true

  # Remove extract directory
  rm -rf "$RESTORE_TEST_DIR/extract_$TIMESTAMP" 2>/dev/null || true

  log "Cleanup completed"
}
# Main execution
main() {
  # Run one restore test: pick the latest backup, create reference data,
  # restore, verify, publish metrics and clean up.
  #
  # Exits: 0 when the restore test passed, 1 when it failed or no backup
  #        could be found
  local start_time end_time duration
  local backup_dir original_checksum extract_dir
  local test_status=0

  start_time=$(date +%s)
  log "Starting ScyllaDB restore test"

  # Find latest backup. Declaration and assignment are separate so that a
  # failure inside the command substitution is not masked by `local`.
  backup_dir=$(find_latest_backup) || exit 1
  log "Using backup: $backup_dir"

  # Create test data and get checksum
  create_test_data
  original_checksum=$(get_test_data_checksum)
  log "Original data checksum: $original_checksum"

  # Snapshot test data (for comparison). The returned tag is not needed
  # here, and a failure of this step does not decide the test result.
  snapshot_test_data > /dev/null || log "WARNING: Could not snapshot test data"

  # Extract, restore and verify
  if extract_dir=$(extract_backup "$backup_dir") \
    && restore_from_backup "$extract_dir" \
    && verify_restored_data "$original_checksum"; then
    log "Restore test PASSED"
    test_status=0
  else
    log "Restore test FAILED"
    test_status=1
  fi

  # Calculate duration and update metrics
  end_time=$(date +%s)
  duration=$((end_time - start_time))
  update_restore_metrics "$test_status" "$duration"

  # Cleanup
  cleanup_test_data
  log "Restore test completed in ${duration}s with status $test_status"
  exit $test_status
}
main "$@"
# Make the restore-test script executable and owned by the scylla user.
sudo chmod +x /opt/scylladb-backup/scripts/restore-test.sh
sudo chown scylla:scylla /opt/scylladb-backup/scripts/restore-test.sh
Configure systemd timers for automation
Set up systemd services and timers for automated backup and restore testing.
# Oneshot unit that runs backup.sh as the scylla user.
# It is ordered after, and requires, scylla-server.service.
[Unit]
Description=ScyllaDB Backup Service
After=scylla-server.service
Requires=scylla-server.service
[Service]
Type=oneshot
User=scylla
Group=scylla
ExecStart=/opt/scylladb-backup/scripts/backup.sh
# Read by backup.sh: snapshot every keyspace, keep backups for 7 days.
Environment=SCYLLA_KEYSPACES=all
Environment=BACKUP_RETENTION_DAYS=7
StandardOutput=journal
StandardError=journal
[Install]
WantedBy=multi-user.target
# Daily trigger for scylladb-backup.service.
# There is deliberately no Requires= on the service: a Requires= dependency
# would start the backup as soon as the timer unit itself is started (at
# enable time and on every boot) instead of only when the timer elapses.
[Unit]
Description=ScyllaDB Backup Timer
[Timer]
Unit=scylladb-backup.service
OnCalendar=daily
# Spread the start time by up to 30 minutes.
RandomizedDelaySec=30m
# Catch up on a run that was missed while the machine was off.
Persistent=true
[Install]
WantedBy=timers.target
# Oneshot unit that runs restore-test.sh as the scylla user.
# It is ordered after, and requires, scylla-server.service.
[Unit]
Description=ScyllaDB Restore Test Service
After=scylla-server.service
Requires=scylla-server.service
[Service]
Type=oneshot
User=scylla
Group=scylla
ExecStart=/opt/scylladb-backup/scripts/restore-test.sh
StandardOutput=journal
StandardError=journal
# Allow the test up to one hour before systemd considers it failed.
TimeoutStartSec=3600
[Install]
WantedBy=multi-user.target
# Weekly trigger for scylladb-restore-test.service.
# There is deliberately no Requires= on the service: a Requires= dependency
# would start the restore test as soon as the timer unit itself is started
# (at enable time and on every boot) instead of only when the timer elapses.
[Unit]
Description=ScyllaDB Restore Test Timer
[Timer]
Unit=scylladb-restore-test.service
OnCalendar=weekly
# Spread the start time by up to one hour.
RandomizedDelaySec=1h
# Catch up on a run that was missed while the machine was off.
Persistent=true
[Install]
WantedBy=timers.target
Install and configure node_exporter for metrics
Set up node_exporter to expose backup metrics to Prometheus.
# Download node_exporter 1.7.0 and install the binary.
wget https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz
tar xzf node_exporter-1.7.0.linux-amd64.tar.gz
sudo cp node_exporter-1.7.0.linux-amd64/node_exporter /usr/local/bin/
# Dedicated system account without a login shell.
sudo useradd -rs /bin/false node_exporter
sudo mkdir -p /var/lib/node_exporter/textfile_collector
# The backup and restore-test scripts run as scylla and write their *.prom
# files into this directory, so the scylla group needs write access;
# node_exporter keeps ownership and only has to read the files.
sudo chown node_exporter:scylla /var/lib/node_exporter/textfile_collector
sudo chmod 775 /var/lib/node_exporter/textfile_collector
# Runs node_exporter as its own user with the textfile collector pointed at
# the directory the backup scripts write their .prom files to.
[Unit]
Description=Prometheus Node Exporter
Wants=network-online.target
After=network-online.target
[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter --collector.textfile.directory=/var/lib/node_exporter/textfile_collector
# Restart the exporter 3 seconds after any exit.
Restart=always
RestartSec=3
[Install]
WantedBy=multi-user.target
Create Prometheus alerting rules
Configure alerts for backup failures and validation issues.
# Alerting rules for the metrics written by the backup and restore-test
# scripts. In every annotation, {{ $value }} is the value of the alert
# expression itself, so it is only formatted as what that expression yields.
groups:
  - name: scylladb_backup
    rules:
      # Fires when the last successful backup is older than 25 hours or the
      # last validation reported a failure.
      - alert: ScyllaDBBackupFailed
        expr: |
          (
            time() - scylladb_backup_last_success_timestamp > 90000
          ) or (
            scylladb_backup_validation_status > 0
          )
        for: 5m
        labels:
          severity: critical
          service: scylladb
          component: backup
        annotations:
          summary: "ScyllaDB backup validation failed"
          description: |
            ScyllaDB backup on {{ $labels.instance }} has failed validation or is overdue.
            - Expression value: {{ $value }} (seconds since the last successful backup when overdue, otherwise the validation status)
            Check backup logs for details.

      - alert: ScyllaDBRestoreTestFailed
        expr: scylladb_restore_test_status > 0
        for: 5m
        labels:
          severity: critical
          service: scylladb
          component: backup
        annotations:
          summary: "ScyllaDB restore test failed"
          description: |
            ScyllaDB restore test on {{ $labels.instance }} has failed.
            This indicates backups may not be recoverable.
            Test duration: {{ with printf "scylladb_restore_test_duration_seconds{instance='%s'}" $labels.instance | query }}{{ . | first | value | humanizeDuration }}{{ end }}
            Check restore test logs immediately.

      # Fires when the latest backup is less than half or more than twice the
      # 7-day average size.
      - alert: ScyllaDBBackupSizeAnomaly
        expr: |
          (
            scylladb_backup_size_bytes /
            avg_over_time(scylladb_backup_size_bytes[7d])
          ) < 0.5 or (
            scylladb_backup_size_bytes /
            avg_over_time(scylladb_backup_size_bytes[7d])
          ) > 2
        for: 5m
        labels:
          severity: warning
          service: scylladb
          component: backup
        annotations:
          summary: "ScyllaDB backup size anomaly detected"
          description: |
            ScyllaDB backup size on {{ $labels.instance }} is significantly different from the 7-day average.
            Ratio to the 7-day average: {{ $value | humanize }}
            This may indicate data corruption or incomplete backup.

      - alert: ScyllaDBRestoreTestOverdue
        expr: time() - scylladb_restore_test_last_run_timestamp > 604800
        for: 1h
        labels:
          severity: warning
          service: scylladb
          component: backup
        annotations:
          summary: "ScyllaDB restore test is overdue"
          description: |
            ScyllaDB restore test on {{ $labels.instance }} hasn't run in over 7 days.
            Time since last test: {{ $value | humanizeDuration }}
            Regular restore testing is critical for backup validation.
Enable and start services
Enable all backup automation services and start monitoring.
# Pick up the newly created unit files
sudo systemctl daemon-reload
# Enable and start node_exporter
sudo systemctl enable --now node_exporter
# Enable backup and restore test timers
sudo systemctl enable scylladb-backup.timer
sudo systemctl enable scylladb-restore-test.timer
# Start timers
sudo systemctl start scylladb-backup.timer
sudo systemctl start scylladb-restore-test.timer
Create manual execution scripts
Create wrapper scripts for manual backup and restore testing.
#!/bin/bash
# Manual backup execution script
#
# Usage: manual-backup.sh [KEYSPACES] [RETENTION_DAYS]
#   KEYSPACES       whitespace-separated keyspaces, or "all" (default: all)
#   RETENTION_DAYS  days to keep backups (default: 7)
set -euo pipefail
echo "Starting manual ScyllaDB backup..."
# Set environment variables (read by backup.sh)
export SCYLLA_KEYSPACES="${1:-all}"
export BACKUP_RETENTION_DAYS="${2:-7}"
echo "Keyspaces: $SCYLLA_KEYSPACES"
echo "Retention: $BACKUP_RETENTION_DAYS days"
echo ""
# Run backup script
/opt/scylladb-backup/scripts/backup.sh
echo ""
echo "Backup completed. Check logs in /opt/scylladb-backup/logs/"
echo "View metrics at: http://localhost:9100/metrics"
#!/bin/bash
# Manual restore test execution script
set -euo pipefail
echo "Starting manual ScyllaDB restore test..."
echo "This will create temporary test data and validate backup recovery."
echo "Process may take several minutes..."
echo ""
# Run restore test
/opt/scylladb-backup/scripts/restore-test.sh
echo ""
echo "Restore test completed. Check logs in /opt/scylladb-backup/logs/"
echo "View metrics at: http://localhost:9100/metrics"
# Make both manual wrapper scripts executable and owned by the scylla user.
sudo chmod +x /opt/scylladb-backup/scripts/manual-*.sh
sudo chown scylla:scylla /opt/scylladb-backup/scripts/manual-*.sh
Configure Grafana monitoring dashboards
Import ScyllaDB backup dashboard
Create a comprehensive Grafana dashboard for monitoring backup health.
{
"dashboard": {
"id": null,
"title": "ScyllaDB Backup Monitoring",
"tags": ["scylladb", "backup", "monitoring"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "Backup Status Overview",
"type": "stat",
"targets": [
{
"expr": "scylladb_backup_validation_status",
"legendFormat": "Backup Status"
}
],
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": {
"text": "SUCCESS",
"color": "green"
},
"1": {
"text": "FAILED",
"color": "red"
}
}
}
]
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "Restore Test Status",
"type": "stat",
"targets": [
{
"expr": "scylladb_restore_test_status",
"legendFormat": "Restore Test Status"
}
],
"fieldConfig": {
"defaults": {
"mappings": [
{
"options": {
"0": {
"text": "SUCCESS",
"color": "green"
},
"1": {
"text": "FAILED",
"color": "red"
}
}
}
]
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
},
{
"id": 3,
"title": "Backup Size Trend",
"type": "timeseries",
"targets": [
{
"expr": "scylladb_backup_size_bytes",
"legendFormat": "Backup Size (Bytes)"
}
],
"fieldConfig": {
"defaults": {
"unit": "bytes"
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 8
}
},
{
"id": 4,
"title": "Restore Test Duration",
"type": "timeseries",
"targets": [
{
"expr": "scylladb_restore_test_duration_seconds",
"legendFormat": "Test Duration (Seconds)"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
}
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
}
},
{
"id": 5,
"title": "Backup File Count",
"type": "timeseries",
"targets": [
{
"expr": "scylladb_backup_file_count",
"legendFormat": "File Count"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
}
}
],
"time": {
"from": "now-7d",
"to": "now"
},
"refresh": "5m"
}
}
Verify your setup
Test the backup validation and restore testing system.
# Check service status
sudo systemctl status scylladb-backup.timer
sudo systemctl status scylladb-restore-test.timer
sudo systemctl status node_exporter
# Run manual backup test
sudo -u scylla /opt/scylladb-backup/scripts/manual-backup.sh
# Check backup was created
ls -la /opt/scylladb-backup/snapshots/
cat /opt/scylladb-backup/snapshots/*/metadata.json
# Run manual restore test
sudo -u scylla /opt/scylladb-backup/scripts/manual-restore-test.sh
# Check metrics are exposed
curl -s http://localhost:9100/metrics | grep scylladb_backup
# View recent logs
sudo journalctl -u scylladb-backup.service -n 50
sudo journalctl -u scylladb-restore-test.service -n 50
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Backup script fails with permission denied | Incorrect file ownership or permissions | sudo chown -R scylla:scylla /opt/scylladb-backup && sudo chmod -R 755 /opt/scylladb-backup |
| Snapshot creation fails | ScyllaDB not running or insufficient space | Check sudo systemctl status scylla-server and disk space with df -h |
| Restore test hangs during ScyllaDB restart | ScyllaDB taking too long to start | Increase timeout in restore script or check ScyllaDB logs: sudo journalctl -u scylla-server |
| Metrics not appearing in Prometheus | node_exporter not collecting textfile metrics | Verify textfile collector directory: ls -la /var/lib/node_exporter/textfile_collector/ |
| Validation fails with checksum mismatch | Backup corruption during transfer | Check disk health with sudo smartctl -a /dev/sda and retry backup |
| Timer not executing automatically | Systemd timer misconfiguration | sudo systemctl list-timers scylladb-* and check timer syntax |
Next steps
- Configure ScyllaDB cluster monitoring with Prometheus and Grafana dashboards
- Implement ScyllaDB disaster recovery with cross-region replication
- Setup S3-compatible disaster recovery with cross-region replication using MinIO
- Configure ScyllaDB SSL encryption and authentication with certificate management
- Implement backup encryption key rotation and secure management with GPG
Running this in production?
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
set -euo pipefail
# ScyllaDB Backup Validation and Restore Testing Setup Script
# Production-quality installer with multi-distro support

# Colors for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly NC='\033[0m'

# Configuration
SCRIPT_NAME="$(basename "$0")"
readonly SCRIPT_NAME
readonly BACKUP_DIR="/opt/scylladb-backup"
# These two are defaults only: the -u and -d command line options assign to
# them later, so they must stay writable.
BACKUP_USER="${BACKUP_USER:-scylla}"
RETENTION_DAYS="${RETENTION_DAYS:-7}"
# Function definitions
# Print an error in red on stderr and terminate the script with status 1.
error() {
  printf '%b\n' "${RED}ERROR: $1${NC}" >&2
  exit 1
}

# Print a warning in yellow on stderr.
warn() {
  printf '%b\n' "${YELLOW}WARNING: $1${NC}" >&2
}

# Print a success message in green on stdout.
success() {
  printf '%b\n' "${GREEN}SUCCESS: $1${NC}"
}

# Print a plain informational message on stdout.
info() {
  printf '%b\n' "$1"
}
# Trap handler: undo a failed installation.
# Reads the exit status of the command that triggered it. On failure the
# backup directory is removed only if it holds no backups yet, so a failed
# re-run of the installer can never destroy existing backups.
cleanup() {
  local status=$?
  if [ "$status" -eq 0 ]; then
    return 0
  fi

  warn "Installation failed. Cleaning up..."
  if [ ! -d "$BACKUP_DIR" ]; then
    return 0
  fi

  if [ -d "$BACKUP_DIR/snapshots" ] \
    && [ -n "$(find "$BACKUP_DIR/snapshots" -mindepth 1 -print -quit 2>/dev/null)" ]; then
    warn "Keeping $BACKUP_DIR because it already contains backups"
  else
    rm -rf -- "${BACKUP_DIR:?}"
  fi
}
# Print the command line help text on stdout.
usage() {
  printf '%s\n' \
    "Usage: $SCRIPT_NAME [OPTIONS]" \
    "Setup ScyllaDB backup validation and automated restore testing" \
    "OPTIONS:" \
    "-u USER Backup user (default: scylla)" \
    "-d DAYS Retention period in days (default: 7)" \
    "-h Show this help" \
    "ENVIRONMENT VARIABLES:" \
    "BACKUP_USER User to run backups as" \
    "RETENTION_DAYS Backup retention period"
}
# Step 1/8: abort (via error) unless the script runs as root and nodetool is
# available on PATH.
check_prerequisites() {
  info "[1/8] Checking prerequisites..."

  # Check if running as root or with sudo
  [ "$EUID" -eq 0 ] || error "This script must be run as root or with sudo"

  # Check if ScyllaDB is installed
  command -v nodetool &> /dev/null \
    || error "ScyllaDB/Cassandra nodetool not found. Please install ScyllaDB first"

  success "Prerequisites check passed"
}
# Step 2/8: identify the distribution and choose the package manager commands.
#
# Globals set: PKG_MGR, PKG_UPDATE, PKG_INSTALL, EPEL_INSTALL
# Environment: OS_RELEASE_FILE - alternative os-release file to read
#              (default: /etc/os-release)
# Exits (via error) when the file is missing or the distribution is unsupported.
detect_distro() {
  info "[2/8] Detecting operating system..."
  local os_release="${OS_RELEASE_FILE:-/etc/os-release}"

  if [ ! -f "$os_release" ]; then
    error "Cannot detect operating system. $os_release not found"
  fi
  # shellcheck source=/dev/null
  . "$os_release"

  # The command strings are executed later by plain word splitting, so they
  # must not contain shell operators such as "|| true". "makecache" refreshes
  # the package metadata and returns 0 on success.
  case "${ID:-}" in
    ubuntu|debian)
      PKG_MGR="apt"
      PKG_UPDATE="apt update"
      PKG_INSTALL="apt install -y"
      EPEL_INSTALL=""
      ;;
    almalinux|rocky|centos|rhel|ol)
      PKG_MGR="dnf"
      PKG_UPDATE="dnf makecache"
      PKG_INSTALL="dnf install -y"
      EPEL_INSTALL="dnf install -y epel-release"
      ;;
    fedora)
      PKG_MGR="dnf"
      PKG_UPDATE="dnf makecache"
      PKG_INSTALL="dnf install -y"
      EPEL_INSTALL=""
      ;;
    amzn)
      PKG_MGR="yum"
      PKG_UPDATE="yum makecache"
      PKG_INSTALL="yum install -y"
      EPEL_INSTALL="yum install -y epel-release"
      ;;
    *)
      error "Unsupported distribution: ${ID:-unknown}"
      ;;
  esac

  success "Detected ${PRETTY_NAME:-${ID:-unknown}}"
}
# Step 3/8: install the packages the backup tooling needs.
#
# Globals: PKG_UPDATE, PKG_INSTALL, EPEL_INSTALL (read; set by detect_distro)
install_dependencies() {
  info "[3/8] Installing dependencies..."

  # Intentionally unquoted: the variables hold a command plus its arguments.
  # A failed index refresh is not fatal; the install below reports real
  # problems.
  $PKG_UPDATE || warn "Package index refresh returned a non-zero status; continuing"

  # Install EPEL if needed
  if [ -n "$EPEL_INSTALL" ]; then
    $EPEL_INSTALL
  fi

  # Install required packages
  $PKG_INSTALL python3 python3-pip jq pigz parallel curl

  # Install Python packages. This is best-effort: distributions that mark the
  # system Python as externally managed refuse system-wide pip installs, and
  # that must not abort the installer. The distribution's own pip is left
  # alone instead of being upgraded in place.
  if ! pip3 install prometheus-client; then
    warn "Could not install prometheus-client with pip3; continuing without it"
  fi

  success "Dependencies installed"
}
# Step 4/8: make sure the backup user exists.
#
# Globals: BACKUP_USER (read)
# Exits (via error) when the group or user cannot be created, instead of
# reporting success and failing later at the first chown.
create_backup_user() {
  info "[4/8] Setting up backup user..."

  if id "$BACKUP_USER" &>/dev/null; then
    success "Backup user configured"
    return 0
  fi

  warn "User $BACKUP_USER not found. Creating system user..."

  # useradd can only reference the scylla group if it already exists.
  if ! getent group scylla >/dev/null 2>&1; then
    groupadd -r scylla || error "Failed to create group scylla"
  fi

  # A user called "scylla" takes the existing group as its primary group;
  # any other backup user gets it as a supplementary group.
  local -a group_args=(-G scylla)
  if [ "$BACKUP_USER" = "scylla" ]; then
    group_args=(-g scylla)
  fi

  useradd -r -s /bin/bash -d /var/lib/scylla "${group_args[@]}" "$BACKUP_USER" \
    || error "Failed to create user $BACKUP_USER"

  success "Backup user configured"
}
# Step 5/8: create the backup tree and the metrics directory.
#
# Globals: BACKUP_DIR, BACKUP_USER (read)
create_directories() {
  info "[5/8] Creating directory structure..."
  local textfile_dir="/var/lib/node_exporter/textfile_collector"

  # Create main directories
  mkdir -p "$BACKUP_DIR"/{snapshots,validation,restore-test,scripts,logs}

  # Create node_exporter textfile directory if it doesn't exist
  mkdir -p "$textfile_dir"

  # Set proper ownership and permissions
  chown -R "$BACKUP_USER:scylla" "$BACKUP_DIR"
  chmod -R 755 "$BACKUP_DIR"

  # Ensure scylla user can write to backup directory
  chmod g+w "$BACKUP_DIR"/{snapshots,validation,restore-test,logs}

  # The backup script runs with group scylla and writes its .prom file into
  # the textfile directory, so that group needs write access there as well.
  chgrp scylla "$textfile_dir"
  chmod g+w "$textfile_dir"

  success "Directory structure created"
}
# Step 6/8: write the backup/validation script to
# $BACKUP_DIR/scripts/backup_validator.sh and make it executable.
#
# Globals: BACKUP_DIR, BACKUP_USER (read)
#
# The heredoc delimiter is quoted, so the script text below is written
# verbatim; nothing in it is expanded by the installer. In the generated
# script, the retention cleanup uses "-mindepth 1" so that find can never
# select (and rm -rf) the snapshot root directory itself.
create_backup_script() {
  info "[6/8] Creating backup validation script..."
  cat > "$BACKUP_DIR/scripts/backup_validator.sh" << 'EOF'
#!/usr/bin/env bash
set -euo pipefail
# Configuration
BACKUP_DIR="/opt/scylladb-backup"
SNAPSHOT_DIR="$BACKUP_DIR/snapshots"
VALIDATION_DIR="$BACKUP_DIR/validation"
RESTORE_TEST_DIR="$BACKUP_DIR/restore-test"
LOG_DIR="$BACKUP_DIR/logs"
KEYSPACES="${SCYLLA_KEYSPACES:-system_schema}"
RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-7}
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
SNAPSHOT_TAG="backup_$TIMESTAMP"
LOG_FILE="$LOG_DIR/backup_$TIMESTAMP.log"
METRICS_FILE="/var/lib/node_exporter/textfile_collector/scylladb_backup.prom"
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') $1" | tee -a "$LOG_FILE"
}
create_snapshot() {
log "Creating snapshot: $SNAPSHOT_TAG"
nodetool clearsnapshot 2>/dev/null || true
if [ "$KEYSPACES" = "all" ]; then
nodetool snapshot -t "$SNAPSHOT_TAG"
else
for ks in $KEYSPACES; do
nodetool snapshot -t "$SNAPSHOT_TAG" "$ks"
done
fi
log "Snapshot created successfully"
}
copy_snapshot() {
log "Copying snapshot data to backup directory"
local snapshot_backup_dir="$SNAPSHOT_DIR/$TIMESTAMP"
mkdir -p "$snapshot_backup_dir"
find /var/lib/scylla/data -name "$SNAPSHOT_TAG" -type d | while read -r snap_dir; do
local rel_path=$(echo "$snap_dir" | sed "s|/var/lib/scylla/data/||")
local dest_dir="$snapshot_backup_dir/$rel_path"
mkdir -p "$(dirname "$dest_dir")"
log "Copying $snap_dir to $dest_dir"
tar -I pigz -cf "${dest_dir}.tar.gz" -C "$(dirname "$snap_dir")" "$(basename "$snap_dir")"
done
create_metadata "$snapshot_backup_dir"
log "Snapshot copy completed"
}
create_metadata() {
local backup_dir="$1"
local metadata_file="$backup_dir/metadata.json"
log "Creating backup metadata"
local cluster_name=$(nodetool describecluster | grep "Name:" | awk '{print $2}')
local node_id=$(nodetool info | grep "ID" | awk '{print $2}')
local checksums_file="$backup_dir/checksums.sha256"
find "$backup_dir" -name "*.tar.gz" -exec sha256sum {} + > "$checksums_file"
cat > "$metadata_file" << EOJ
{
"timestamp": "$TIMESTAMP",
"snapshot_tag": "$SNAPSHOT_TAG",
"cluster_name": "$cluster_name",
"node_id": "$node_id",
"keyspaces": "$KEYSPACES",
"backup_size": $(du -sb "$backup_dir" | cut -f1),
"file_count": $(find "$backup_dir" -name "*.tar.gz" | wc -l)
}
EOJ
}
validate_backup() {
log "Validating backup integrity"
local backup_dir="$SNAPSHOT_DIR/$TIMESTAMP"
local validation_log="$VALIDATION_DIR/validation_$TIMESTAMP.log"
local validation_status=0
if [ ! -f "$backup_dir/metadata.json" ]; then
echo "FAIL: metadata.json missing" >> "$validation_log"
validation_status=1
fi
if [ ! -f "$backup_dir/checksums.sha256" ]; then
echo "FAIL: checksums.sha256 missing" >> "$validation_log"
validation_status=1
else
cd "$backup_dir"
if sha256sum -c checksums.sha256 >> "$validation_log" 2>&1; then
echo "PASS: All checksums verified" >> "$validation_log"
else
echo "FAIL: Checksum verification failed" >> "$validation_log"
validation_status=1
fi
fi
update_metrics $validation_status
return $validation_status
}
update_metrics() {
local status=$1
local backup_size=$(du -sb "$SNAPSHOT_DIR/$TIMESTAMP" 2>/dev/null | cut -f1 || echo 0)
cat > "$METRICS_FILE" << EOM
# HELP scylladb_backup_last_success_timestamp Unix timestamp of last successful backup
# TYPE scylladb_backup_last_success_timestamp gauge
scylladb_backup_last_success_timestamp $(date +%s)
# HELP scylladb_backup_status Status of last backup (0=failed, 1=success)
# TYPE scylladb_backup_status gauge
scylladb_backup_status $((1-status))
# HELP scylladb_backup_size_bytes Size of last backup in bytes
# TYPE scylladb_backup_size_bytes gauge
scylladb_backup_size_bytes $backup_size
EOM
}
cleanup_old_backups() {
log "Cleaning up backups older than $RETENTION_DAYS days"
find "$SNAPSHOT_DIR" -mindepth 1 -maxdepth 1 -type d -mtime +$RETENTION_DAYS -exec rm -rf {} + || true
find "$VALIDATION_DIR" -name "*.log" -mtime +$RETENTION_DAYS -delete || true
find "$LOG_DIR" -name "*.log" -mtime +$RETENTION_DAYS -delete || true
nodetool clearsnapshot 2>/dev/null || true
}
main() {
log "Starting backup process"
create_snapshot
copy_snapshot
if validate_backup; then
log "Backup validation successful"
else
log "Backup validation failed"
exit 1
fi
cleanup_old_backups
log "Backup process completed"
}
main "$@"
EOF
  chmod 755 "$BACKUP_DIR/scripts/backup_validator.sh"
  chown "$BACKUP_USER:scylla" "$BACKUP_DIR/scripts/backup_validator.sh"
  success "Backup validation script created"
}
# Step 7/8: install the backup service and its daily timer, then enable and
# start the timer.
#
# Globals:     BACKUP_USER, BACKUP_DIR, RETENTION_DAYS (read)
# Environment: SYSTEMD_UNIT_DIR - directory the unit files are written to
#              (default: /etc/systemd/system)
configure_systemd() {
  info "[7/8] Configuring systemd timer..."
  local unit_dir="${SYSTEMD_UNIT_DIR:-/etc/systemd/system}"

  # Create systemd service
  cat > "$unit_dir/scylladb-backup.service" << EOF
[Unit]
Description=ScyllaDB Backup Validation Service
After=scylla-server.service
Requires=scylla-server.service
[Service]
Type=oneshot
User=$BACKUP_USER
Group=scylla
Environment=BACKUP_RETENTION_DAYS=$RETENTION_DAYS
ExecStart=$BACKUP_DIR/scripts/backup_validator.sh
WorkingDirectory=$BACKUP_DIR
StandardOutput=journal
StandardError=journal
EOF

  # Create systemd timer. The timer must not Require= the service: that
  # dependency would start a backup whenever the timer unit itself is
  # started (right below, and on every boot) rather than when it elapses.
  cat > "$unit_dir/scylladb-backup.timer" << EOF
[Unit]
Description=ScyllaDB Backup Validation Timer
[Timer]
Unit=scylladb-backup.service
OnCalendar=daily
RandomizedDelaySec=1h
Persistent=true
[Install]
WantedBy=timers.target
EOF

  systemctl daemon-reload
  systemctl enable scylladb-backup.timer
  systemctl start scylladb-backup.timer
  success "Systemd timer configured and started"
}
# Step 8/8: check that everything the installer created is in place and
# print a short summary.
#
# Globals: BACKUP_DIR, BACKUP_USER (read)
# Exits (via error) on the first failed check.
verify_installation() {
  info "[8/8] Verifying installation..."
  local dir

  # Check directories
  for dir in snapshots validation restore-test scripts logs; do
    [ -d "$BACKUP_DIR/$dir" ] || error "Directory $BACKUP_DIR/$dir not found"
  done

  # Check script permissions
  [ -x "$BACKUP_DIR/scripts/backup_validator.sh" ] || error "Backup script not executable"

  # Check systemd timer
  if ! systemctl is-enabled scylladb-backup.timer &>/dev/null; then
    error "Systemd timer not enabled"
  fi

  # Test backup script syntax
  bash -n "$BACKUP_DIR/scripts/backup_validator.sh" || error "Backup script syntax error"

  success "Installation verified successfully"
  info "ScyllaDB backup validation setup complete!"
  info "- Backups will run daily via systemd timer"
  # The installer defines no LOG_DIR variable, so the log location is derived
  # from BACKUP_DIR.
  info "- Logs available in: $BACKUP_DIR/logs"
  info "- Manual backup: sudo -u $BACKUP_USER $BACKUP_DIR/scripts/backup_validator.sh"
  info "- Timer status: systemctl status scylladb-backup.timer"
}
# Parse command line arguments: -u and -d override the defaults, -h prints
# the help text, anything else prints the help text and fails.
while getopts "u:d:h" opt; do
  case "$opt" in
    h)
      usage
      exit 0
      ;;
    u)
      BACKUP_USER="$OPTARG"
      ;;
    d)
      RETENTION_DAYS="$OPTARG"
      ;;
    *)
      usage
      exit 1
      ;;
  esac
done

# Set up error handling
# NOTE(review): bash only runs an ERR trap inside functions when errtrace
# (set -E) is enabled, which this script does not do - confirm whether
# cleanup is expected to run for failures inside the install steps.
trap cleanup ERR

# Main execution: run the eight install steps in order
for step in \
  check_prerequisites \
  detect_distro \
  install_dependencies \
  create_backup_user \
  create_directories \
  create_backup_script \
  configure_systemd \
  verify_installation; do
  "$step"
done
Review the script before running. The installer checks that it is running as root, so execute it with: sudo bash install.sh