Set up comprehensive monitoring for Apache Cassandra clusters using JMX exporter, Prometheus metrics collection, and Grafana dashboards with alerting rules for cluster health.
Prerequisites
- Apache Cassandra cluster running
- Root or sudo access
- Network connectivity between nodes
- Basic knowledge of Prometheus and Grafana
What this solves
Apache Cassandra clusters generate hundreds of performance and health metrics through JMX, but without proper monitoring, you'll miss critical issues like node failures, disk space problems, or read/write latency spikes. This tutorial configures JMX exporter to expose Cassandra metrics, sets up Prometheus to collect them, and creates Grafana dashboards with alerting rules for comprehensive cluster monitoring.
Step-by-step configuration
Install JMX Prometheus exporter
Download and configure the JMX exporter to expose Cassandra metrics in Prometheus format.
cd /opt
sudo wget https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.20.0/jmx_prometheus_javaagent-0.20.0.jar
sudo chown cassandra:cassandra jmx_prometheus_javaagent-0.20.0.jar
Create JMX exporter configuration
Create /opt/cassandra-jmx-config.yaml and configure the JMX exporter to collect essential Cassandra metrics including node health, keyspace metrics, and thread pool statistics.
# JMX exporter rules for Cassandra.
# Pattern format: domain<beanProperty=value, ...><keys>attribute
# Capture groups in the pattern are referenced as $1, $2, ... in name/labels.
rules:
  # Node health metrics (Load is a Counter MBean, so its attribute is Count)
  - pattern: "org.apache.cassandra.metrics<type=Storage, name=Load><>Count"
    name: cassandra_storage_load_bytes
    help: "Total disk space used by node in bytes"

  # Read/Write latency
  - pattern: "org.apache.cassandra.metrics<type=ClientRequest, scope=([^,]+), name=Latency><>Count"
    name: cassandra_client_request_latency_total
    labels:
      request_type: "$1"
    help: "Total client request latency count"

  - pattern: "org.apache.cassandra.metrics<type=ClientRequest, scope=([^,]+), name=Latency><>(Mean|95thPercentile|99thPercentile)"
    name: cassandra_client_request_latency_seconds
    type: GAUGE
    labels:
      request_type: "$1"
      quantile: "$2"
    help: "Client request latency in seconds"
    # Cassandra reports latencies in microseconds; convert to seconds so the
    # metric name and the alert thresholds (0.1 / 0.05) are correct.
    valueFactor: 0.000001

  # Connection metrics
  - pattern: "org.apache.cassandra.metrics<type=Connection, scope=([^,]+), name=([^,]+)><>Value"
    name: cassandra_connection_$2
    labels:
      connection_type: "$1"
    help: "Cassandra connection metrics"

  # Keyspace metrics
  - pattern: "org.apache.cassandra.metrics<type=Keyspace, keyspace=([^,]+), name=([^,]+)><>(Count|Value)"
    name: cassandra_keyspace_$2
    labels:
      keyspace: "$1"
    help: "Cassandra keyspace metrics"

  # Table metrics
  - pattern: "org.apache.cassandra.metrics<type=Table, keyspace=([^,]+), scope=([^,]+), name=([^,]+)><>(Count|Value)"
    name: cassandra_table_$3
    labels:
      keyspace: "$1"
      table: "$2"
    help: "Cassandra table metrics"

  # Thread pool metrics (gauges expose Value, counters such as
  # CurrentlyBlockedTasks expose Count)
  - pattern: "org.apache.cassandra.metrics<type=ThreadPools, path=([^,]+), scope=([^,]+), name=([^,]+)><>(Count|Value)"
    name: cassandra_threadpool_$3
    labels:
      pool_type: "$1"
      pool_name: "$2"
    help: "Cassandra thread pool metrics"

  # Compaction metrics
  - pattern: "org.apache.cassandra.metrics<type=Compaction, name=([^,]+)><>(Count|Value)"
    name: cassandra_compaction_$1
    help: "Cassandra compaction metrics"

  # Cache metrics
  - pattern: "org.apache.cassandra.metrics<type=Cache, scope=([^,]+), name=([^,]+)><>(Count|Value)"
    name: cassandra_cache_$2
    labels:
      cache_name: "$1"
    help: "Cassandra cache metrics"
Configure Cassandra with JMX exporter
Add the JMX exporter as a Java agent to Cassandra's JVM startup options by appending the following to cassandra-env.sh (typically /etc/cassandra/cassandra-env.sh).
# Add JMX Prometheus exporter
JVM_OPTS="$JVM_OPTS -javaagent:/opt/jmx_prometheus_javaagent-0.20.0.jar=7070:/opt/cassandra-jmx-config.yaml"
Restart Cassandra service
Restart Cassandra to load the JMX exporter configuration.
sudo systemctl restart cassandra
sudo systemctl status cassandra
Verify JMX exporter is working
Check that the JMX exporter is exposing metrics on port 7070.
curl http://localhost:7070/metrics | grep cassandra_storage_load_bytes
ss -tlnp | grep 7070
Install Prometheus
Install Prometheus to collect metrics from the Cassandra JMX exporter.
sudo apt update
sudo apt install -y prometheus
Configure Prometheus to scrape Cassandra metrics
Add Cassandra nodes to the Prometheus configuration (/etc/prometheus/prometheus.yml) for metric collection.
# Prometheus server configuration: global timing, alert rule files,
# Alertmanager endpoint, and the scrape jobs.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Relative paths are resolved against the directory of this file.
rule_files:
  - "cassandra_alerts.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093

scrape_configs:
  # Prometheus scraping itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # One target per Cassandra node running the JMX exporter agent
  - job_name: 'cassandra-cluster'
    static_configs:
      - targets:
          - 'cassandra-node-1:7070'
          - 'cassandra-node-2:7070'
          - 'cassandra-node-3:7070'
    # Per-job overrides of the global defaults
    scrape_interval: 30s
    scrape_timeout: 10s
    metrics_path: /metrics
    params:
      format: ['prometheus']
Create Cassandra alerting rules
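Define alert rules for critical Cassandra cluster conditions in cassandra_alerts.yml, saved in the same directory as prometheus.yml so the rule_files entry resolves.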
# Alerting rules for the 'cassandra-cluster' scrape job.
groups:
  - name: cassandra_cluster
    rules:
      # Node availability
      - alert: CassandraNodeDown
        expr: up{job="cassandra-cluster"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Cassandra node {{ $labels.instance }} is down"
          description: "Cassandra node {{ $labels.instance }} has been down for more than 2 minutes."

      # High read latency (threshold in seconds)
      - alert: CassandraHighReadLatency
        expr: cassandra_client_request_latency_seconds{request_type="Read", quantile="95thPercentile"} > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Cassandra read latency on {{ $labels.instance }}"
          description: "95th percentile read latency is {{ $value }}s on {{ $labels.instance }}."

      # High write latency (threshold in seconds)
      - alert: CassandraHighWriteLatency
        expr: cassandra_client_request_latency_seconds{request_type="Write", quantile="95thPercentile"} > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Cassandra write latency on {{ $labels.instance }}"
          description: "95th percentile write latency is {{ $value }}s on {{ $labels.instance }}."

      # Disk space usage (absolute data size in GiB held by the node)
      - alert: CassandraHighDiskUsage
        expr: (cassandra_storage_load_bytes / (1024^3)) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage on Cassandra node {{ $labels.instance }}"
          description: "Disk usage is {{ $value }}GB on {{ $labels.instance }}."

      # Pending compactions
      - alert: CassandraHighPendingCompactions
        expr: cassandra_compaction_PendingTasks > 20
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High pending compactions on {{ $labels.instance }}"
          description: "{{ $value }} compactions are pending on {{ $labels.instance }}."

      # Thread pool queue size
      - alert: CassandraHighThreadPoolQueue
        expr: cassandra_threadpool_PendingTasks{pool_name="MutationStage"} > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High thread pool queue on {{ $labels.instance }}"
          description: "{{ $labels.pool_name }} has {{ $value }} pending tasks on {{ $labels.instance }}."
Start and enable Prometheus
Enable Prometheus to start automatically and verify it's collecting metrics.
sudo systemctl enable --now prometheus
sudo systemctl status prometheus
Install Grafana
Install Grafana for visualizing Cassandra cluster metrics.
# Install the tools needed to add a signed third-party APT repository
sudo apt install -y apt-transport-https software-properties-common wget gnupg
# Store the Grafana signing key in a dedicated keyring; apt-key is deprecated
# and no longer shipped on current Debian/Ubuntu releases
sudo mkdir -p /etc/apt/keyrings
wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/grafana.gpg > /dev/null
# Overwrite (not append) so re-running this step does not duplicate the entry
echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" | sudo tee /etc/apt/sources.list.d/grafana.list
sudo apt update
sudo apt install -y grafana
Start and enable Grafana
Enable Grafana service and access the web interface.
sudo systemctl enable --now grafana-server
sudo systemctl status grafana-server
Configure Grafana data source
Add Prometheus as a data source in Grafana. Navigate to http://your-server:3000 (admin/admin), then go to Configuration > Data Sources > Add data source.
Name: Prometheus
Type: Prometheus
URL: http://localhost:9090
Access: Server (default)
HTTP Method: GET
Create Cassandra cluster overview dashboard
Create a comprehensive dashboard for monitoring cluster health. In Grafana, go to Dashboards > New > New Dashboard and add these panels.
# Node availability
up{job="cassandra-cluster"}
Total nodes
count(up{job="cassandra-cluster"})
Healthy nodes
count(up{job="cassandra-cluster"} == 1)
Add read/write latency panels
Monitor client request latencies across the cluster.
# Read latency 95th percentile
cassandra_client_request_latency_seconds{request_type="Read", quantile="95thPercentile"}
Write latency 95th percentile
cassandra_client_request_latency_seconds{request_type="Write", quantile="95thPercentile"}
Read throughput
rate(cassandra_client_request_latency_total{request_type="Read"}[5m])
Write throughput
rate(cassandra_client_request_latency_total{request_type="Write"}[5m])
Add storage and compaction panels
Monitor disk usage and compaction activity.
# Disk usage per node (GB)
cassandra_storage_load_bytes / (1024^3)
Pending compactions
cassandra_compaction_PendingTasks
Completed compactions rate
rate(cassandra_compaction_CompletedTasks[5m])
Add thread pool monitoring
Monitor thread pool health and queue sizes.
# Active tasks
cassandra_threadpool_ActiveTasks
Pending tasks
cassandra_threadpool_PendingTasks
Blocked tasks
cassandra_threadpool_CurrentlyBlockedTasks
Configure alerting notifications
Set up notification channels for Grafana alerts. Go to Alerting > Notification channels.
Name: cassandra-alerts
Type: Email
Addresses: ops-team@example.com
Subject: Cassandra Alert - {{ range .Alerts }}{{ .Annotations.summary }}{{ end }}
Create alerting rules in Grafana
Configure dashboard alerts for critical metrics. Edit each panel and go to Alert tab.
Condition: IS BELOW 1
Evaluation: every 1m for 2m
Notifications: Send to cassandra-alerts
Message: Cassandra node is down - check cluster status immediately
Export and save dashboard
Save your dashboard configuration for backup and version control.
# Dashboards are fetched by UID (the slug-based /api/dashboards/db/ endpoint was removed).
# Set this to the UID shown in the dashboard URL: http://your-server:3000/d/UID/...
DASHBOARD_UID="cassandra-cluster"
curl -s -u admin:admin "http://localhost:3000/api/dashboards/uid/${DASHBOARD_UID}" > cassandra-dashboard.json
Verify your setup
Confirm that your monitoring stack is collecting and displaying Cassandra metrics correctly.
# Check Cassandra JMX exporter
curl -s http://localhost:7070/metrics | grep -c cassandra_
Verify Prometheus targets
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[].health'
Test a Prometheus query
# Pass the PromQL via --data-urlencode: braces placed directly in the URL are consumed by curl's URL globbing
curl -sG http://localhost:9090/api/v1/query --data-urlencode 'query=up{job="cassandra-cluster"}' | jq '.data.result[].value[1]'
Check Grafana is running
curl -s http://localhost:3000/api/health | jq '.database'
Verify alert rules are loaded
curl -s http://localhost:9090/api/v1/rules | jq '.data.groups[].name'
In Grafana, verify you can see:
- All Cassandra nodes showing as "up" in the cluster status panel
- Read and write latency metrics updating in real-time
- Storage usage data for each node
- Thread pool activity and queue sizes
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| JMX exporter port not accessible | Firewall blocking port 7070 | sudo ufw allow 7070 or configure iptables |
| Prometheus shows targets as down | Incorrect hostnames in config | Use IP addresses or verify DNS resolution |
| Missing Cassandra metrics in Prometheus | JMX exporter not loaded correctly | Check /var/log/cassandra/system.log for agent errors |
| High memory usage after enabling monitoring | Too frequent scraping or large metric cardinality | Increase scrape_interval to 60s and filter unused metrics |
| Grafana shows no data | Data source URL incorrect | Verify Prometheus URL is http://localhost:9090 |
| Alerts not firing | Alert rule syntax errors | Validate rules with promtool check rules cassandra_alerts.yml |
Next steps
- Configure Prometheus alerting rules for cgroup metrics monitoring to expand your monitoring coverage
- Set up centralized log aggregation with ELK stack for Cassandra log analysis
- Configure automated Cassandra backup strategies with nodetool
- Implement Cassandra performance tuning and optimization
- Set up multi-datacenter Cassandra replication monitoring
Running this in production?
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
# Installs the JMX Prometheus exporter for Cassandra and a local Prometheus
# that scrapes it. Abort on errors, unset variables, and failed pipelines.
set -o errexit -o nounset -o pipefail

# ANSI colour sequences; stored as literal escapes and expanded by `echo -e`
NC='\033[0m' # reset
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'

# Settings: taken from the environment when present, otherwise defaulted here
: "${CASSANDRA_USER:=cassandra}"
: "${JMX_EXPORTER_PORT:=7070}"
: "${PROMETHEUS_PORT:=9090}"
# Print the help text and terminate the script.
#   $1 - exit status (optional; defaults to 1 so error paths stay non-zero)
usage() {
    echo "Usage: $0 [OPTIONS]"
    echo "Install and configure Cassandra monitoring with Prometheus and Grafana"
    echo ""
    echo "Options:"
    echo " -h, --help Show this help message"
    echo " --cassandra-user USER Cassandra user (default: cassandra)"
    echo " --jmx-port PORT JMX exporter port (default: 7070)"
    echo " --prometheus-port PORT Prometheus port (default: 9090)"
    echo ""
    exit "${1:-1}"
}

# Abort with the help text when an option is missing its value.
# Called as `require_value "$@"`: $1 is the option name, $2 its value.
# Without this, `set -u` would kill the script with a bare
# "unbound variable" message.
require_value() {
    if [[ $# -lt 2 || -z "$2" ]]; then
        echo -e "${RED}Error: Option $1 requires a value${NC}"
        usage
    fi
}

# Parse arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        -h|--help)
            # Asking for help is not an error
            usage 0
            ;;
        --cassandra-user)
            require_value "$@"
            CASSANDRA_USER="$2"
            shift 2
            ;;
        --jmx-port)
            require_value "$@"
            JMX_EXPORTER_PORT="$2"
            shift 2
            ;;
        --prometheus-port)
            require_value "$@"
            PROMETHEUS_PORT="$2"
            shift 2
            ;;
        *)
            echo -e "${RED}Error: Unknown option $1${NC}"
            usage
            ;;
    esac
done

# Ports end up in config files and firewall commands, so reject anything that
# is not a plain TCP port number (covers values from flags and environment).
for port_value in "$JMX_EXPORTER_PORT" "$PROMETHEUS_PORT"; do
    # 10# forces base 10 so a leading zero is not read as octal
    if ! [[ "$port_value" =~ ^[0-9]{1,5}$ ]] || (( 10#$port_value < 1 || 10#$port_value > 65535 )); then
        echo -e "${RED}Error: Invalid port '$port_value' (expected 1-65535)${NC}"
        exit 1
    fi
done
# ERR trap handler: runs when any command fails under `set -e`.
# Only the Prometheus service this script manages is stopped. Cassandra is
# deliberately left running: a failed monitoring install (for example a
# package download error) must not take the database down.
cleanup() {
    echo -e "${RED}Installation failed! Cleaning up...${NC}"
    systemctl stop prometheus 2>/dev/null || true
    # JVM_OPTS_FILE is only set once the cassandra-env.sh step has run
    if [ -n "${JVM_OPTS_FILE:-}" ] && [ -f "${JVM_OPTS_FILE}.backup" ]; then
        echo -e "${YELLOW}Original Cassandra settings are saved in ${JVM_OPTS_FILE}.backup${NC}"
    fi
}
trap cleanup ERR
# Everything below installs packages and edits system files: require uid 0
if (( EUID != 0 )); then
    echo -e "${RED}Error: This script must be run as root or with sudo${NC}"
    exit 1
fi
# Work out the package manager and Prometheus package name from os-release
echo -e "${YELLOW}[1/10] Detecting Linux distribution...${NC}"
if [ ! -f /etc/os-release ]; then
    echo -e "${RED}Error: Cannot detect Linux distribution${NC}"
    exit 1
fi

# Sourcing os-release defines ID and PRETTY_NAME
. /etc/os-release
case "$ID" in
    ubuntu|debian)
        PKG_MGR="apt"
        PKG_UPDATE="apt update"
        PROMETHEUS_PKG="prometheus"
        ;;
    almalinux|rocky|centos|rhel|ol|fedora)
        PKG_MGR="dnf"
        PKG_UPDATE="dnf update -y"
        PROMETHEUS_PKG="prometheus2"
        ;;
    amzn)
        PKG_MGR="yum"
        PKG_UPDATE="yum update -y"
        PROMETHEUS_PKG="prometheus2"
        ;;
    *)
        echo -e "${RED}Error: Unsupported distribution: $ID${NC}"
        exit 1
        ;;
esac

# Identical for every supported family, so derived once here
PKG_INSTALL="$PKG_MGR install -y"
PROMETHEUS_CONFIG="/etc/prometheus/prometheus.yml"
PROMETHEUS_SERVICE="prometheus"
echo -e "${GREEN}Detected: $PRETTY_NAME${NC}"
# Make sure the helper tools used later (wget, curl, ss) are installed
echo -e "${YELLOW}[2/10] Checking prerequisites...${NC}"

# The package that ships `ss` is named iproute2 on Debian/Ubuntu but iproute
# on the RPM-based families handled by this script.
case "$PKG_MGR" in
    apt) IPROUTE_PKG="iproute2" ;;
    *)   IPROUTE_PKG="iproute" ;;
esac

# PKG_INSTALL is intentionally unquoted: it holds a command plus its flags
command -v wget >/dev/null 2>&1 || $PKG_INSTALL wget
command -v curl >/dev/null 2>&1 || $PKG_INSTALL curl
command -v ss >/dev/null 2>&1 || $PKG_INSTALL "$IPROUTE_PKG"
# The exporter files are chown'ed to this account, so it has to exist already
id "$CASSANDRA_USER" >/dev/null 2>&1 || {
    echo -e "${RED}Error: Cassandra user '$CASSANDRA_USER' does not exist${NC}"
    echo "Please install Cassandra first or specify correct user with --cassandra-user"
    exit 1
}
# Download the JMX Prometheus exporter agent into /opt
echo -e "${YELLOW}[3/10] Downloading JMX Prometheus exporter...${NC}"
cd /opt
jmx_jar="jmx_prometheus_javaagent-0.20.0.jar"
if [ ! -f "$jmx_jar" ]; then
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated jar that a later run
    # would accept as "already exists".
    wget -q -O "$jmx_jar.part" "https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.20.0/$jmx_jar"
    mv "$jmx_jar.part" "$jmx_jar"
    echo -e "${GREEN}JMX exporter downloaded successfully${NC}"
else
    echo -e "${GREEN}JMX exporter already exists${NC}"
fi
# Applied on every run so a pre-existing jar also ends up readable by Cassandra
chown "$CASSANDRA_USER:$CASSANDRA_USER" "$jmx_jar"
chmod 644 "$jmx_jar"
# Write the JMX exporter rule file. The heredoc delimiter is quoted so the
# $1/$2/$3 regex group references are written literally, not shell-expanded.
# The YAML nesting below is significant: each rule is one list item under `rules`.
echo -e "${YELLOW}[4/10] Creating JMX exporter configuration...${NC}"
cat > /opt/cassandra-jmx-config.yaml << 'EOF'
rules:
  # Node health metrics (Load is a Counter MBean, so its attribute is Count)
  - pattern: "org.apache.cassandra.metrics<type=Storage, name=Load><>Count"
    name: cassandra_storage_load_bytes
    help: "Total disk space used by node in bytes"

  # Read/Write latency
  - pattern: "org.apache.cassandra.metrics<type=ClientRequest, scope=([^,]+), name=Latency><>Count"
    name: cassandra_client_request_latency_total
    labels:
      request_type: "$1"
    help: "Total client request latency count"

  - pattern: "org.apache.cassandra.metrics<type=ClientRequest, scope=([^,]+), name=Latency><>(Mean|95thPercentile|99thPercentile)"
    name: cassandra_client_request_latency_seconds
    type: GAUGE
    labels:
      request_type: "$1"
      quantile: "$2"
    help: "Client request latency in seconds"
    # Source values are microseconds
    valueFactor: 0.000001

  # Connection metrics
  - pattern: "org.apache.cassandra.metrics<type=Connection, scope=([^,]+), name=([^,]+)><>Value"
    name: cassandra_connection_$2
    labels:
      connection_type: "$1"
    help: "Cassandra connection metrics"

  # Thread pool metrics (gauges expose Value, counters expose Count)
  - pattern: "org.apache.cassandra.metrics<type=ThreadPools, path=([^,]+), scope=([^,]+), name=([^,]+)><>(Count|Value)"
    name: cassandra_threadpool_$3
    labels:
      pool_type: "$1"
      pool_name: "$2"
    help: "Cassandra thread pool metrics"

  # Compaction metrics
  - pattern: "org.apache.cassandra.metrics<type=Compaction, name=([^,]+)><>(Count|Value)"
    name: cassandra_compaction_$1
    help: "Cassandra compaction metrics"

  # Cache metrics
  - pattern: "org.apache.cassandra.metrics<type=Cache, scope=([^,]+), name=([^,]+)><>(Count|Value)"
    name: cassandra_cache_$2
    labels:
      cache_name: "$1"
    help: "Cassandra cache metrics"
EOF
# The agent runs inside the Cassandra JVM, so that account must be able to read the file
chown "$CASSANDRA_USER:$CASSANDRA_USER" /opt/cassandra-jmx-config.yaml
chmod 644 /opt/cassandra-jmx-config.yaml
# Register the exporter as a -javaagent in cassandra-env.sh
echo -e "${YELLOW}[5/10] Configuring Cassandra with JMX exporter...${NC}"

# Probe the known install layouts in priority order; first existing file wins
JVM_OPTS_FILE=""
for candidate in \
    /etc/cassandra/cassandra-env.sh \
    /etc/cassandra/conf/cassandra-env.sh \
    /opt/cassandra/conf/cassandra-env.sh
do
    if [ -f "$candidate" ]; then
        JVM_OPTS_FILE="$candidate"
        break
    fi
done
if [ -z "$JVM_OPTS_FILE" ]; then
    echo -e "${RED}Error: Cannot find cassandra-env.sh file${NC}"
    exit 1
fi

# Idempotent: the agent line is appended only when no earlier run added it
if grep -q "jmx_prometheus_javaagent" "$JVM_OPTS_FILE"; then
    echo -e "${GREEN}JMX exporter already configured${NC}"
else
    # Keep a copy of the unmodified file next to it
    cp "$JVM_OPTS_FILE" "$JVM_OPTS_FILE.backup"
    {
        echo ""
        echo "# JMX Prometheus exporter"
        # \$JVM_OPTS stays literal so cassandra-env.sh expands it at startup
        echo "JVM_OPTS=\"\$JVM_OPTS -javaagent:/opt/jmx_prometheus_javaagent-0.20.0.jar=$JMX_EXPORTER_PORT:/opt/cassandra-jmx-config.yaml\""
    } >> "$JVM_OPTS_FILE"
    echo -e "${GREEN}JMX exporter added to Cassandra configuration${NC}"
fi
# Install the distribution's Prometheus package
echo -e "${YELLOW}[6/10] Installing Prometheus...${NC}"
case "$PKG_MGR" in
    apt)
        # Only apt gets an index refresh; the other families' PKG_UPDATE
        # value is a full `update -y`, which this step does not run
        $PKG_UPDATE
        ;;
esac
$PKG_INSTALL $PROMETHEUS_PKG
# Write a minimal Prometheus configuration scraping itself and the local
# Cassandra exporter. The heredoc is unquoted on purpose so the port
# variables are expanded; the YAML nesting below is significant.
echo -e "${YELLOW}[7/10] Configuring Prometheus...${NC}"
mkdir -p "$(dirname "$PROMETHEUS_CONFIG")"

# Preserve whatever configuration was there before the first run; later runs
# keep that first backup instead of overwriting it with generated content.
if [ -f "$PROMETHEUS_CONFIG" ] && [ ! -f "$PROMETHEUS_CONFIG.backup" ]; then
    cp -p "$PROMETHEUS_CONFIG" "$PROMETHEUS_CONFIG.backup"
fi

cat > "$PROMETHEUS_CONFIG" << EOF
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:$PROMETHEUS_PORT']

  - job_name: 'cassandra-cluster'
    static_configs:
      - targets: ['localhost:$JMX_EXPORTER_PORT']
    scrape_interval: 30s
    metrics_path: /metrics
EOF
# Fall back to root ownership when the prometheus account does not exist
chown prometheus:prometheus "$PROMETHEUS_CONFIG" 2>/dev/null || chown root:root "$PROMETHEUS_CONFIG"
chmod 644 "$PROMETHEUS_CONFIG"
# Open the exporter and Prometheus ports on whichever firewall is active
echo -e "${YELLOW}[8/10] Configuring firewall...${NC}"

# firewalld is probed first, ufw second; anything else is reported as "none"
fw_backend="none"
if command -v firewall-cmd >/dev/null 2>&1 && systemctl is-active --quiet firewalld; then
    fw_backend="firewalld"
elif command -v ufw >/dev/null 2>&1 && ufw status | grep -q "Status: active"; then
    fw_backend="ufw"
fi

case "$fw_backend" in
    firewalld)
        for fw_port in "$JMX_EXPORTER_PORT" "$PROMETHEUS_PORT"; do
            firewall-cmd --permanent --add-port=$fw_port/tcp
        done
        # Permanent rules only take effect after a reload
        firewall-cmd --reload
        echo -e "${GREEN}Firewalld configured${NC}"
        ;;
    ufw)
        for fw_port in "$JMX_EXPORTER_PORT" "$PROMETHEUS_PORT"; do
            ufw allow $fw_port/tcp
        done
        echo -e "${GREEN}UFW configured${NC}"
        ;;
    *)
        echo -e "${YELLOW}No active firewall detected, skipping firewall configuration${NC}"
        ;;
esac
# Restart Cassandra so the JVM picks up the agent, and (re)start Prometheus
echo -e "${YELLOW}[9/10] Starting services...${NC}"
systemctl restart cassandra
systemctl enable prometheus
# restart, not start: some packages launch Prometheus at install time, and
# `start` on an already-running unit would leave the old config loaded
systemctl restart prometheus

# Give both services a moment, then keep polling for the exporter port.
# A Cassandra JVM can need well over 10 seconds before the agent listens;
# stop waiting after 120 seconds in total and let verification report it.
sleep 10
waited_secs=10
until ss -tln | grep ":$JMX_EXPORTER_PORT " >/dev/null; do
    if (( waited_secs >= 120 )); then
        break
    fi
    sleep 5
    waited_secs=$(( waited_secs + 5 ))
done
# Verify that both services run and that the exporter serves Cassandra metrics
echo -e "${YELLOW}[10/10] Verifying installation...${NC}"

# Check Cassandra is running
if systemctl is-active --quiet cassandra; then
    echo -e "${GREEN}✓ Cassandra is running${NC}"
else
    echo -e "${RED}✗ Cassandra is not running${NC}"
    exit 1
fi

# Check JMX exporter port.
# Output is captured first rather than piped into `grep -q`: under
# `set -o pipefail` a producer that is still writing when grep exits on its
# first match fails with a broken pipe and turns a match into a failure.
listening_sockets="$(ss -tlnp || true)"
if grep -q ":$JMX_EXPORTER_PORT " <<< "$listening_sockets"; then
    echo -e "${GREEN}✓ JMX exporter is listening on port $JMX_EXPORTER_PORT${NC}"
else
    echo -e "${RED}✗ JMX exporter is not listening on port $JMX_EXPORTER_PORT${NC}"
    exit 1
fi

# Check Prometheus is running
if systemctl is-active --quiet prometheus; then
    echo -e "${GREEN}✓ Prometheus is running${NC}"
else
    echo -e "${RED}✗ Prometheus is not running${NC}"
    exit 1
fi

# Test metrics endpoint (captured for the same pipefail reason; the metrics
# page is large, so the broken-pipe case is the common one here).
# A miss is only a warning: Cassandra may not have registered its MBeans yet.
metrics_output="$(curl -s "http://localhost:$JMX_EXPORTER_PORT/metrics" || true)"
if grep -q "cassandra_" <<< "$metrics_output"; then
    echo -e "${GREEN}✓ Cassandra metrics are being exposed${NC}"
else
    echo -e "${YELLOW}⚠ Cassandra metrics not yet available (may need more time to start)${NC}"
fi
# Final report: success banner in green, then endpoints, files and follow-ups
echo -e "${GREEN}"
cat << EOF
==================================
Installation completed successfully!
==================================
EOF
echo -e "${NC}"
# Unquoted heredoc so the port and path variables are expanded
cat << EOF
Access points:
- Cassandra JMX metrics: http://localhost:$JMX_EXPORTER_PORT/metrics
- Prometheus: http://localhost:$PROMETHEUS_PORT

Configuration files:
- JMX exporter config: /opt/cassandra-jmx-config.yaml
- Prometheus config: $PROMETHEUS_CONFIG

Next steps:
1. Install Grafana for visualization
2. Import Cassandra dashboard templates
3. Configure alerting rules
EOF
Review the script before running. It must run as root, so execute it with: sudo bash install.sh