Set up comprehensive monitoring for ScyllaDB clusters using Prometheus metrics collection and Grafana visualization dashboards. Configure alerting rules for performance monitoring and health checks.
Prerequisites
- ScyllaDB cluster running
- Root or sudo access
- 4GB RAM minimum
- Network access between nodes
What this solves
ScyllaDB provides extensive metrics through its built-in monitoring endpoints, but collecting and visualizing these metrics requires proper setup. This tutorial shows you how to configure Prometheus to scrape ScyllaDB metrics, set up Grafana dashboards for cluster visualization, and implement alerting rules for proactive monitoring of your NoSQL database cluster.
Step-by-step configuration
Install Prometheus
First, install Prometheus to collect metrics from your ScyllaDB cluster.
sudo apt update
wget https://github.com/prometheus/prometheus/releases/download/v2.48.0/prometheus-2.48.0.linux-amd64.tar.gz
tar xzf prometheus-2.48.0.linux-amd64.tar.gz
sudo mv prometheus-2.48.0.linux-amd64 /opt/prometheus
sudo useradd --system --shell /bin/false prometheus
sudo chown -R prometheus:prometheus /opt/prometheus
Configure Prometheus for ScyllaDB
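Create the Prometheus configuration file at /opt/prometheus/prometheus.yml (the path passed to --config.file in the systemd unit below) and list your ScyllaDB nodes as scrape targets. ScyllaDB exposes metrics on port 9180 by default.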
# Prometheus server configuration for a three-node ScyllaDB cluster.
global:
  scrape_interval: 15s      # default; individual jobs override it below
  evaluation_interval: 15s  # how often alerting rules are evaluated

# Relative rule paths are resolved against the directory of this file.
rule_files:
  - 'scylla_rules.yml'

# Alertmanager instance that receives fired alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'localhost:9093'

scrape_configs:
  # Prometheus scraping its own metrics.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # ScyllaDB nodes; replace the example addresses with your own nodes.
  - job_name: 'scylla'
    scrape_interval: 10s
    metrics_path: /metrics
    params:
      format: [prometheus]
    static_configs:
      - targets:
          - '203.0.113.10:9180'
          - '203.0.113.11:9180'
          - '203.0.113.12:9180'

  # Scylla Manager metrics endpoint.
  - job_name: 'scylla-manager'
    scrape_interval: 30s
    static_configs:
      - targets:
          - '203.0.113.10:56090'
Create ScyllaDB alerting rules
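Define alerting rules specific to ScyllaDB performance and health monitoring. Save them as /opt/prometheus/scylla_rules.yml, next to prometheus.yml, so that the relative rule_files entry resolves.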
# Alerting rules for ScyllaDB nodes scraped under job="scylla".
groups:
  - name: scylla.rules
    rules:
      # Scrape target unreachable.
      - alert: ScyllaNodeDown
        expr: up{job="scylla"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ScyllaDB node is down"
          description: "ScyllaDB node {{ $labels.instance }} has been down for more than 1 minute."

      - alert: ScyllaHighCPU
        expr: scylla_reactor_utilization > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU utilization on ScyllaDB node"
          description: "CPU utilization is {{ $value }} on {{ $labels.instance }}"

      - alert: ScyllaHighLatency
        expr: scylla_storage_proxy_coordinator_read_latency{quantile="0.99"} > 100000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High read latency detected"
          description: "99th percentile read latency is {{ $value }}us on {{ $labels.instance }}"

      - alert: ScyllaLowDiskSpace
        expr: scylla_node_filesystem_avail_bytes / scylla_node_filesystem_size_bytes < 0.1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on ScyllaDB node"
          description: "Available disk space is below 10% on {{ $labels.instance }}"

      - alert: ScyllaCompactionBacklog
        expr: scylla_compaction_manager_pending_tasks > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High compaction backlog"
          description: "Compaction backlog has {{ $value }} pending tasks on {{ $labels.instance }}"

      - alert: ScyllaHighMemoryUsage
        expr: scylla_memory_allocated_bytes / scylla_memory_total_bytes > 0.9
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High memory usage on ScyllaDB node"
          description: "Memory usage is {{ $value | humanizePercentage }} on {{ $labels.instance }}"

      - alert: ScyllaTimeouts
        expr: rate(scylla_storage_proxy_coordinator_read_timeouts_total[5m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High timeout rate detected"
          description: "Read timeout rate is {{ $value }}/sec on {{ $labels.instance }}"

      - alert: ScyllaErrorRate
        expr: rate(scylla_storage_proxy_coordinator_read_errors_total[5m]) > 0.1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Read error rate is {{ $value }}/sec on {{ $labels.instance }}"

      # NOTE(review): the expression, alert name and description disagree.
      # As written the two sides of `and` are matched on all labels, and the
      # right-hand side carries a `mode` label the left-hand side presumably
      # lacks, so this likely never fires. Adding on(instance) would instead
      # fire on every idle healthy node. Decide the intended condition before
      # relying on this alert.
      - alert: ScyllaStreamingErrors
        expr: rate(scylla_streaming_total_incoming_bytes[5m]) == 0 and scylla_node_operation_mode{mode="NORMAL"} == 1
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "No streaming activity detected"
          description: "No incoming streaming detected on {{ $labels.instance }} during repair/bootstrap"

      # Alert on *new* large partitions only. Comparing the raw counter with 0
      # would keep the alert firing forever after the first occurrence.
      # Assumes the metric is a counter, as its `_total` suffix suggests.
      - alert: ScyllaLargePartitions
        expr: increase(scylla_large_partition_exceeding_threshold_total[1h]) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Large partitions detected"
          description: "{{ $value }} new large partitions in the last hour on {{ $labels.instance }}"

      # Free memory is a gauge: use deriv() for its trend. rate() is only
      # meaningful for counters and never yields a negative value, so the
      # previous `rate(...) < 0` condition could not fire.
      - alert: ScyllaGCPressure
        expr: deriv(scylla_memory_free_bytes[5m]) < 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Memory pressure detected"
          description: "Decreasing free memory trend on {{ $labels.instance }}"

      - alert: ScyllaConnectionErrors
        expr: rate(scylla_cql_connections_rejected_total[5m]) > 1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High connection rejection rate"
          description: "CQL connection rejection rate is {{ $value }}/sec on {{ $labels.instance }}"

      - alert: ScyllaRepairProgress
        expr: scylla_repair_segment_total == 0 and on(instance) scylla_node_operation_mode{mode="NORMAL"} == 1
        for: 24h
        labels:
          severity: warning
        annotations:
          summary: "No repair activity in 24 hours"
          description: "Node {{ $labels.instance }} has not run repair in over 24 hours"
Create Prometheus systemd service
Set up Prometheus as a systemd service for automatic startup and management. Save the unit as /etc/systemd/system/prometheus.service.
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target

[Service]
User=prometheus
Group=prometheus
Type=simple
# NOTE: --web.enable-lifecycle enables the HTTP reload/quit endpoints. With
# the listener bound to 0.0.0.0 they are reachable from the network, so
# restrict port 9090 with a firewall or bind to a private address.
ExecStart=/opt/prometheus/prometheus \
    --config.file=/opt/prometheus/prometheus.yml \
    --storage.tsdb.path=/opt/prometheus/data \
    --web.console.templates=/opt/prometheus/consoles \
    --web.console.libraries=/opt/prometheus/console_libraries \
    --web.listen-address=0.0.0.0:9090 \
    --web.enable-lifecycle
# Re-read the configuration on `systemctl reload prometheus`.
ExecReload=/bin/kill -HUP $MAINPID
# Bring the server back automatically if it exits.
Restart=always

[Install]
WantedBy=multi-user.target
Install and configure Grafana
Install Grafana for creating dashboards and visualizations of ScyllaDB metrics.
# Install Grafana OSS from the official APT repository.
# The repository key goes into a dedicated keyring referenced via signed-by;
# `apt-key add` is deprecated and unavailable on current Debian/Ubuntu releases.
sudo apt install -y apt-transport-https software-properties-common wget gnupg
sudo mkdir -p /etc/apt/keyrings
wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/grafana.gpg > /dev/null
echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" | sudo tee /etc/apt/sources.list.d/grafana.list
sudo apt update
sudo apt install -y grafana
Start monitoring services
Enable and start both Prometheus and Grafana services.
sudo mkdir -p /opt/prometheus/data
sudo chown prometheus:prometheus /opt/prometheus/data
sudo systemctl daemon-reload
sudo systemctl enable --now prometheus
sudo systemctl enable --now grafana-server
Configure Grafana data source
Add Prometheus as a data source in Grafana and import ScyllaDB dashboards.
curl -X POST http://admin:admin@localhost:3000/api/datasources \
-H "Content-Type: application/json" \
-d '{
"name": "Prometheus",
"type": "prometheus",
"url": "http://localhost:9090",
"access": "proxy",
"basicAuth": false
}'
Import ScyllaDB dashboard
Create a comprehensive dashboard for ScyllaDB cluster monitoring with key performance indicators.
# Create (or update) the "ScyllaDB Cluster Overview" dashboard via the
# Grafana HTTP API. "overwrite": true makes the call repeatable: without it a
# second run is rejected because a dashboard with the same title exists.
# Panels use the "timeseries" type; the legacy "graph" panel is deprecated.
curl -X POST http://admin:admin@localhost:3000/api/dashboards/db \
  -H "Content-Type: application/json" \
  -d @- <<'EOF'
{
  "dashboard": {
    "id": null,
    "title": "ScyllaDB Cluster Overview",
    "tags": ["scylla", "database"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Node Status",
        "type": "stat",
        "targets": [
          {
            "expr": "up{job=\"scylla\"}",
            "legendFormat": "{{instance}}"
          }
        ],
        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0}
      },
      {
        "id": 2,
        "title": "CPU Utilization",
        "type": "timeseries",
        "targets": [
          {
            "expr": "scylla_reactor_utilization",
            "legendFormat": "{{instance}}"
          }
        ],
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 4}
      },
      {
        "id": 3,
        "title": "Read Latency (99th percentile)",
        "type": "timeseries",
        "targets": [
          {
            "expr": "scylla_storage_proxy_coordinator_read_latency{quantile=\"0.99\"}",
            "legendFormat": "{{instance}}"
          }
        ],
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 4}
      },
      {
        "id": 4,
        "title": "Write Latency (99th percentile)",
        "type": "timeseries",
        "targets": [
          {
            "expr": "scylla_storage_proxy_coordinator_write_latency{quantile=\"0.99\"}",
            "legendFormat": "{{instance}}"
          }
        ],
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 12}
      },
      {
        "id": 5,
        "title": "Operations per Second",
        "type": "timeseries",
        "targets": [
          {
            "expr": "rate(scylla_cql_reads_total[5m])",
            "legendFormat": "Reads - {{instance}}"
          },
          {
            "expr": "rate(scylla_cql_inserts_total[5m])",
            "legendFormat": "Writes - {{instance}}"
          }
        ],
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 12}
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "5s"
  },
  "overwrite": true
}
EOF
Install Alertmanager
Set up Alertmanager to handle alerts generated by Prometheus rules.
wget https://github.com/prometheus/alertmanager/releases/download/v0.26.0/alertmanager-0.26.0.linux-amd64.tar.gz
tar xzf alertmanager-0.26.0.linux-amd64.tar.gz
sudo mv alertmanager-0.26.0.linux-amd64 /opt/alertmanager
sudo useradd --system --shell /bin/false alertmanager
sudo chown -R alertmanager:alertmanager /opt/alertmanager
Configure Alertmanager
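Set up email notifications for ScyllaDB alerts. Save the configuration as /opt/alertmanager/alertmanager.yml (the path passed to --config.file in the service unit) and replace the SMTP settings and recipient addresses with your own.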
# Alertmanager configuration: route ScyllaDB alerts to email by severity.
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alertmanager@example.com'
  smtp_auth_username: 'alertmanager@example.com'
  # Placeholder: replace it, and keep this file out of version control.
  smtp_auth_password: 'your-email-password'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  # Fallback receiver for alerts that match none of the routes below.
  receiver: 'web.hook'
  routes:
    # `matchers` replaces the deprecated `match` field.
    - matchers:
        - severity="critical"
      receiver: 'critical-email'
    - matchers:
        - severity="warning"
      receiver: 'warning-email'

receivers:
  - name: 'web.hook'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/'

  # email_configs has no `subject` or `body` fields: the subject is set
  # through `headers` and the plain-text body through `text`.
  - name: 'critical-email'
    email_configs:
      - to: 'admin@example.com'
        headers:
          Subject: 'CRITICAL: ScyllaDB Alert - {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Instance: {{ .Labels.instance }}
          Severity: {{ .Labels.severity }}
          {{ end }}

  - name: 'warning-email'
    email_configs:
      - to: 'monitoring@example.com'
        headers:
          Subject: 'WARNING: ScyllaDB Alert - {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Instance: {{ .Labels.instance }}
          {{ end }}
Create Alertmanager service
Set up Alertmanager as a systemd service. Save the unit as /etc/systemd/system/alertmanager.service.
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target

[Service]
User=alertmanager
Group=alertmanager
Type=simple
ExecStart=/opt/alertmanager/alertmanager \
    --config.file=/opt/alertmanager/alertmanager.yml \
    --storage.path=/opt/alertmanager/data
# Bring Alertmanager back automatically if it exits, so alerts keep flowing.
Restart=always

[Install]
WantedBy=multi-user.target
Start Alertmanager
Enable and start the Alertmanager service.
sudo mkdir -p /opt/alertmanager/data
sudo chown alertmanager:alertmanager /opt/alertmanager/data
sudo systemctl daemon-reload
sudo systemctl enable --now alertmanager
Configure ScyllaDB monitoring agent
Download the Scylla Monitoring Stack (the scylla-monitoring repository), which provides the official ScyllaDB Grafana dashboards and the helper scripts used in the next step.
# Download the scylla-monitoring 4.5.0 release archive, unpack it and move it
# to /opt/scylla-monitoring, owned by the prometheus user.
# NOTE(review): GitHub tag archives normally unpack into "<repo>-<tag>"; check
# the directory name produced by tar (ls) and adjust the mv source if it
# differs from scylla-monitoring-4.5.0 - TODO confirm.
wget https://github.com/scylladb/scylla-monitoring/archive/refs/tags/scylla-monitoring-4.5.0.tar.gz
tar xzf scylla-monitoring-4.5.0.tar.gz
sudo mv scylla-monitoring-4.5.0 /opt/scylla-monitoring
sudo chown -R prometheus:prometheus /opt/scylla-monitoring
Import advanced ScyllaDB dashboards
Import official ScyllaDB Grafana dashboards for comprehensive monitoring.
cd /opt/scylla-monitoring
sudo -u prometheus ./start-grafana.sh -s prometheus_servers.yml -n node_exporter_servers.yml -G
Verify your setup
Check that all monitoring components are running and collecting metrics properly.
# Service state of the three components. --no-pager keeps each command from
# blocking in a pager so they can be run back to back.
sudo systemctl status --no-pager prometheus
sudo systemctl status --no-pager grafana-server
sudo systemctl status --no-pager alertmanager
# Prometheus: scrape targets and their health.
curl http://localhost:9090/api/v1/targets
# Grafana: health endpoint.
curl http://localhost:3000/api/health
# Alertmanager: use the v2 API; the v1 API is deprecated.
curl http://localhost:9093/api/v2/status
Access Grafana at http://your-server:3000 (default credentials admin/admin; change the password on first login) and verify that ScyllaDB metrics are being collected. Check that the dashboard shows the current cluster status and performance metrics.
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| No metrics from ScyllaDB | Wrong port or endpoint | Verify ScyllaDB metrics endpoint: curl http://node:9180/metrics |
| Prometheus can't scrape targets | Firewall blocking access | Open port 9180: sudo ufw allow 9180 |
| Grafana shows no data | Data source not configured | Check Prometheus data source URL in Grafana settings |
| Alerts not firing | Alertmanager not connected | Verify Alertmanager target in Prometheus: http://localhost:9090/alerts |
| Dashboard import fails | JSON format error | Use Grafana UI to import dashboard ID 9614 for ScyllaDB |
| High memory usage | Too many metrics retained | Adjust Prometheus retention: --storage.tsdb.retention.time=30d |
Next steps
- Configure ScyllaDB SSL encryption and authentication for production security
- Configure Prometheus Alertmanager with Slack integration for team notifications
- Setup ScyllaDB backup automation with S3 integration
- Implement ScyllaDB performance tuning and optimization
- Configure advanced Grafana dashboards and alerting with custom metrics
Running this in production?
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Configuration
PROMETHEUS_VERSION="2.48.0"
PROMETHEUS_USER="prometheus"
PROMETHEUS_HOME="/opt/prometheus"
PROMETHEUS_CONFIG="/etc/prometheus"
PROMETHEUS_DATA="/var/lib/prometheus"
SCYLLA_NODES="${SCYLLA_NODES:-}"
# Function to print colored output
# Print an informational message prefixed with a green [INFO] tag.
#   $1 - message text (backslash escapes in it are interpreted)
print_status() {
    printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
# Print a warning message prefixed with a yellow [WARNING] tag.
#   $1 - message text (backslash escapes in it are interpreted)
print_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}
# Print an error message prefixed with a red [ERROR] tag (written to stdout).
#   $1 - message text (backslash escapes in it are interpreted)
print_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Usage function
# Print the command-line synopsis and terminate the script with status 1.
usage() {
    printf '%s\n' \
        "Usage: $0 [SCYLLA_NODE1,SCYLLA_NODE2,...]" \
        "Example: $0 203.0.113.10,203.0.113.11,203.0.113.12" \
        "If no nodes specified, you'll need to edit the config manually"
    exit 1
}
# Cleanup function
# ERR-trap handler: roll back a failed installation.
# Removes what this script installs (service unit, binaries, generated
# configuration). The data directory and the service user are deliberately
# left in place: they may pre-date this run, and deleting them on a failed
# re-run would destroy existing metrics. A leftover empty data directory or
# system user is harmless and is reused by the next run.
cleanup() {
    local status=$?
    if [[ $status -ne 0 ]]; then
        print_error "Script failed. Cleaning up..."
        systemctl stop prometheus 2>/dev/null || true
        systemctl disable prometheus 2>/dev/null || true
        rm -f /etc/systemd/system/prometheus.service
        # Make systemd forget the removed unit file.
        systemctl daemon-reload 2>/dev/null || true
        rm -rf "$PROMETHEUS_HOME" "$PROMETHEUS_CONFIG"
    fi
}
trap cleanup ERR
# Check if running as root
if [[ $EUID -ne 0 ]]; then
print_error "This script must be run as root or with sudo"
exit 1
fi
# Parse arguments
if [[ $# -gt 1 ]]; then
usage
elif [[ $# -eq 1 ]]; then
SCYLLA_NODES="$1"
fi
# Auto-detect distribution
# Sets PKG_MGR, PKG_UPDATE and PKG_INSTALL for the detected distribution.
# PKG_UPDATE only refreshes repository metadata: "dnf update -y" and
# "yum update -y" would upgrade every installed package on the host, which an
# installer for a single tool should not do.
if [[ ! -f /etc/os-release ]]; then
    print_error "Cannot detect distribution. /etc/os-release not found"
    exit 1
fi
. /etc/os-release
# ${ID:-} keeps "set -u" from aborting if os-release defines no ID.
case "${ID:-}" in
    ubuntu|debian)
        PKG_MGR="apt"
        PKG_UPDATE="apt update"
        PKG_INSTALL="apt install -y"
        ;;
    almalinux|rocky|centos|rhel|ol|fedora)
        PKG_MGR="dnf"
        PKG_UPDATE="dnf makecache -y"
        PKG_INSTALL="dnf install -y"
        ;;
    amzn)
        PKG_MGR="yum"
        PKG_UPDATE="yum makecache -y"
        PKG_INSTALL="yum install -y"
        ;;
    *)
        print_error "Unsupported distribution: ${ID:-unknown}"
        exit 1
        ;;
esac
print_status "[1/8] Updating package manager..."
$PKG_UPDATE
print_status "[2/8] Installing required packages..."
$PKG_INSTALL wget tar
print_status "[3/8] Creating prometheus user..."
if ! id "$PROMETHEUS_USER" &>/dev/null; then
useradd --system --shell /bin/false --home-dir /var/lib/prometheus --create-home "$PROMETHEUS_USER"
fi
print_status "[4/8] Downloading and installing Prometheus..."
cd /tmp
wget -q "https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz"
tar xzf "prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz"
rm -rf "$PROMETHEUS_HOME"
mv "prometheus-${PROMETHEUS_VERSION}.linux-amd64" "$PROMETHEUS_HOME"
mkdir -p "$PROMETHEUS_CONFIG" "$PROMETHEUS_DATA"
chown -R "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$PROMETHEUS_HOME" "$PROMETHEUS_CONFIG" "$PROMETHEUS_DATA"
chmod 755 "$PROMETHEUS_HOME" "$PROMETHEUS_CONFIG" "$PROMETHEUS_DATA"
ln -sf "$PROMETHEUS_HOME/prometheus" /usr/local/bin/prometheus
ln -sf "$PROMETHEUS_HOME/promtool" /usr/local/bin/promtool
print_status "[5/8] Creating Prometheus configuration..."
# The file is written in three parts: a static header ending at the scylla
# job's target list, one target line per node, then the rest of the scylla
# job and the scylla-manager job. Heredoc indentation is significant: it is
# the YAML nesting of the generated file.
cat > "$PROMETHEUS_CONFIG/prometheus.yml" << EOF
global:
  scrape_interval: 15s
  evaluation_interval: 15s
rule_files:
  - "scylla_rules.yml"
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'scylla'
    static_configs:
      - targets:
EOF
# Collect the scrape targets, trimming whitespace and skipping empty entries
# (e.g. from "a, b" or a trailing comma) so no "':9180'" target is emitted.
SCYLLA_TARGETS=()
if [[ -n "$SCYLLA_NODES" ]]; then
    IFS=',' read -ra NODES <<< "$SCYLLA_NODES"
    for node in "${NODES[@]}"; do
        node="${node//[[:space:]]/}"
        [[ -n "$node" ]] || continue
        SCYLLA_TARGETS+=("$node:9180")
    done
fi
# Fall back to the local node when no usable node was given.
if [[ ${#SCYLLA_TARGETS[@]} -eq 0 ]]; then
    SCYLLA_TARGETS=("localhost:9180")
fi
for target in "${SCYLLA_TARGETS[@]}"; do
    echo "          - '$target'" >> "$PROMETHEUS_CONFIG/prometheus.yml"
done
cat >> "$PROMETHEUS_CONFIG/prometheus.yml" << EOF
    scrape_interval: 10s
    metrics_path: /metrics
    params:
      format: [prometheus]
  - job_name: 'scylla-manager'
    static_configs:
      - targets:
          - 'localhost:56090'
    scrape_interval: 30s
EOF
print_status "[6/8] Creating ScyllaDB alerting rules..."
# The heredoc delimiter is quoted so the {{ $labels... }} templates are
# written literally. Heredoc indentation is the YAML nesting of the rule file.
cat > "$PROMETHEUS_CONFIG/scylla_rules.yml" << 'EOF'
groups:
  - name: scylla.rules
    rules:
      - alert: ScyllaNodeDown
        expr: up{job="scylla"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ScyllaDB node is down"
          description: "ScyllaDB node {{ $labels.instance }} has been down for more than 1 minute."
      - alert: ScyllaHighCPU
        expr: scylla_reactor_utilization > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU utilization on ScyllaDB node"
          description: "CPU utilization is {{ $value }} on {{ $labels.instance }}"
      - alert: ScyllaHighLatency
        expr: scylla_storage_proxy_coordinator_read_latency{quantile="0.99"} > 100000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High read latency detected"
          description: "99th percentile read latency is {{ $value }}us on {{ $labels.instance }}"
      - alert: ScyllaLowDiskSpace
        expr: scylla_node_filesystem_avail_bytes / scylla_node_filesystem_size_bytes < 0.1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on ScyllaDB node"
          description: "Available disk space is below 10% on {{ $labels.instance }}"
      - alert: ScyllaCompactionBacklog
        expr: scylla_compaction_manager_pending_tasks > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High compaction backlog"
          description: "Compaction backlog has {{ $value }} pending tasks on {{ $labels.instance }}"
EOF
# Configuration is owned by the service user and world-readable.
chown -R "$PROMETHEUS_USER":"$PROMETHEUS_USER" "$PROMETHEUS_CONFIG"
chmod 644 "$PROMETHEUS_CONFIG"/*.yml
print_status "[7/8] Creating systemd service..."
cat > /etc/systemd/system/prometheus.service << EOF
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target
[Service]
User=$PROMETHEUS_USER
Group=$PROMETHEUS_USER
Type=simple
ExecStart=$PROMETHEUS_HOME/prometheus \\
--config.file=$PROMETHEUS_CONFIG/prometheus.yml \\
--storage.tsdb.path=$PROMETHEUS_DATA \\
--web.console.templates=$PROMETHEUS_HOME/consoles \\
--web.console.libraries=$PROMETHEUS_HOME/console_libraries \\
--web.listen-address=0.0.0.0:9090 \\
--web.enable-lifecycle
ExecReload=/bin/kill -HUP \$MAINPID
Restart=always
[Install]
WantedBy=multi-user.target
EOF
print_status "[8/8] Starting and enabling Prometheus service..."
# Validate the configuration (and the rule file it references) BEFORE the
# service is started, so a bad config is reported as a config error with
# promtool's diagnostics rather than as an opaque service start failure.
if promtool_output="$(promtool check config "$PROMETHEUS_CONFIG/prometheus.yml" 2>&1)"; then
    print_status "✓ Prometheus configuration is valid"
else
    print_error "✗ Prometheus configuration is invalid"
    echo "$promtool_output" >&2
    exit 1
fi
systemctl daemon-reload
systemctl enable prometheus
systemctl start prometheus
# Configure firewall
# Open the Prometheus web port only when a supported firewall is active.
if command -v firewall-cmd &> /dev/null && systemctl is-active firewalld &> /dev/null; then
    firewall-cmd --permanent --add-port=9090/tcp
    firewall-cmd --reload
elif command -v ufw &> /dev/null && ufw status | grep -q "Status: active"; then
    ufw allow 9090/tcp
fi
# Cleanup temporary files
rm -f "/tmp/prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz"
print_status "Verifying installation..."
if systemctl is-active prometheus &> /dev/null; then
    print_status "✓ Prometheus service is running"
else
    print_error "✗ Prometheus service failed to start"
    exit 1
fi
print_status "ScyllaDB monitoring setup completed successfully!"
echo
echo "Prometheus is now running on: http://$(hostname -I | awk '{print $1}'):9090"
echo "Configuration files:"
echo " - Main config: $PROMETHEUS_CONFIG/prometheus.yml"
echo " - Alert rules: $PROMETHEUS_CONFIG/scylla_rules.yml"
echo
if [[ -z "$SCYLLA_NODES" ]]; then
print_warning "No ScyllaDB nodes specified. Edit $PROMETHEUS_CONFIG/prometheus.yml to add your ScyllaDB nodes."
fi
echo "Next steps:"
echo "1. Install and configure Grafana for visualization"
echo "2. Install Alertmanager for alert notifications"
echo "3. Verify ScyllaDB nodes are exposing metrics on port 9180"
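Review the script before running. It must run as root, so execute it with: sudo bash install.sh 203.0.113.10,203.0.113.11,203.0.113.12 (pass your own comma-separated ScyllaDB node addresses).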