Set up comprehensive Elasticsearch cluster monitoring using Prometheus Elasticsearch Exporter and Grafana dashboards. Configure alerting rules for cluster health, performance metrics, and automated notifications.
Prerequisites
- Running Elasticsearch cluster
- Prometheus server installed
- Grafana server installed
- Basic knowledge of metrics and alerting
What this solves
Elasticsearch clusters require continuous monitoring to ensure optimal performance, prevent data loss, and detect issues before they impact your applications. This tutorial shows you how to implement production-grade monitoring using Prometheus to collect Elasticsearch metrics and Grafana to visualize cluster health, performance, and resource utilization with automated alerting.
Step-by-step installation
Update system packages
Start by updating your package manager to ensure you get the latest versions of all required components.
# Refresh the package index; the upgrade runs only if the refresh succeeds.
sudo apt update \
  && sudo apt upgrade -y
Install Prometheus Elasticsearch Exporter
Download and install the official Elasticsearch exporter that will collect metrics from your Elasticsearch cluster and expose them in Prometheus format.
# Download and unpack the release archive in /tmp.
cd /tmp
wget https://github.com/prometheus-community/elasticsearch_exporter/releases/download/v1.7.0/elasticsearch_exporter-1.7.0.linux-amd64.tar.gz
tar -xzf elasticsearch_exporter-1.7.0.linux-amd64.tar.gz
# install(1) copies the binary with root ownership and mode 0755 in one step.
# A plain mv would keep the ownership of the user who extracted the archive,
# leaving the executable run by the service writable by that user.
sudo install -o root -g root -m 755 elasticsearch_exporter-1.7.0.linux-amd64/elasticsearch_exporter /usr/local/bin/
# Remove the archive and the extracted directory; neither is needed any more.
rm -rf elasticsearch_exporter-1.7.0.linux-amd64*
Create Elasticsearch exporter service user
Create a dedicated system user for running the Elasticsearch exporter service with minimal privileges.
# --system allocates the account from the system UID range so it is not
# treated as a regular login user; it has no home directory and no usable shell.
sudo useradd --system --no-create-home --shell /bin/false elasticsearch_exporter
Configure Elasticsearch exporter systemd service
Create a systemd service file to manage the Elasticsearch exporter process and ensure it starts automatically on boot.
# /etc/systemd/system/elasticsearch_exporter.service
[Unit]
Description=Elasticsearch Exporter
After=network.target

[Service]
Type=simple
# Run as the dedicated unprivileged account.
User=elasticsearch_exporter
Group=elasticsearch_exporter
# Bring the exporter back three seconds after any exit.
Restart=always
RestartSec=3
# Scrape http://localhost:9200 and serve the metrics on :9114/metrics.
ExecStart=/usr/local/bin/elasticsearch_exporter \
    --es.uri=http://localhost:9200 \
    --es.all \
    --es.indices \
    --es.indices_settings \
    --es.shards \
    --es.snapshots \
    --es.timeout=30s \
    --web.listen-address=:9114 \
    --web.telemetry-path=/metrics

[Install]
WantedBy=multi-user.target
Start and enable Elasticsearch exporter
Enable the service to start automatically on boot and start it immediately to begin collecting metrics.
# Make systemd pick up the new unit file.
sudo systemctl daemon-reload
# Register the unit for boot, then launch it right away.
sudo systemctl enable elasticsearch_exporter.service
sudo systemctl start elasticsearch_exporter.service
# Confirm the unit is active.
sudo systemctl status elasticsearch_exporter.service
Configure Prometheus to scrape Elasticsearch metrics
Add the Elasticsearch exporter as a scrape target in your Prometheus configuration to collect metrics every 15 seconds.
# Prometheus server configuration (for example /etc/prometheus/prometheus.yml).
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alerting rules for Elasticsearch, defined in a separate rules file.
rule_files:
  - "elasticsearch_alerts.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Host and port of your Alertmanager instance; adjust as needed.
            - 'alertmanager:9093'

scrape_configs:
  # Prometheus scraping itself.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Elasticsearch exporter listening on port 9114.
  - job_name: 'elasticsearch'
    scrape_interval: 15s
    metrics_path: /metrics
    static_configs:
      - targets: ['localhost:9114']
Create Elasticsearch alerting rules
Define alerting rules to monitor critical Elasticsearch metrics including cluster health, node availability, and performance thresholds.
# Alerting rules for cluster health, capacity and performance.
groups:
  - name: elasticsearch
    rules:
      # Fires immediately: a red cluster has unavailable primary shards.
      - alert: ElasticsearchClusterRed
        expr: elasticsearch_cluster_health_status{color="red"} == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Elasticsearch Cluster Red (instance {{ $labels.instance }})
          description: "Elastic Search Cluster is Red\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ElasticsearchClusterYellow
        expr: elasticsearch_cluster_health_status{color="yellow"} == 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }})
          description: "Elastic Search Cluster is Yellow for 2 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The threshold assumes a three-node cluster; set it to your expected node count.
      - alert: ElasticsearchNodeDown
        expr: elasticsearch_cluster_health_number_of_nodes < 3
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Elasticsearch node down (instance {{ $labels.instance }})
          description: "Missing node in Elasticsearch cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Less than 10% of the data filesystem available, i.e. usage above 90%.
      - alert: ElasticsearchDiskSpaceLow
        expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Elasticsearch disk space low (instance {{ $labels.instance }})
          description: "Elasticsearch node disk usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ElasticsearchHeapUsageHigh
        expr: elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"} * 100 > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Elasticsearch heap usage high (instance {{ $labels.instance }})
          description: "Elasticsearch heap usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ElasticsearchIndexingErrors
        expr: rate(elasticsearch_indices_indexing_index_failed_total[5m]) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Elasticsearch indexing errors (instance {{ $labels.instance }})
          description: "Elasticsearch indexing errors detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Both metrics are cumulative counters, so dividing the raw values yields
      # the average since node start. Dividing their rates gives the mean
      # seconds per query over the last five minutes instead.
      - alert: ElasticsearchSearchLatencyHigh
        expr: rate(elasticsearch_indices_search_query_time_seconds[5m]) / rate(elasticsearch_indices_search_query_total[5m]) > 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Elasticsearch search latency high (instance {{ $labels.instance }})
          description: "Elasticsearch search latency is above 1 second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ElasticsearchPendingTasks
        expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Elasticsearch pending tasks (instance {{ $labels.instance }})
          description: "Elasticsearch has pending tasks for 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ElasticsearchRelocatingShards
        expr: elasticsearch_cluster_health_relocating_shards > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Elasticsearch relocating shards (instance {{ $labels.instance }})
          description: "Elasticsearch has relocating shards for 15 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ElasticsearchUnassignedShards
        expr: elasticsearch_cluster_health_unassigned_shards > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Elasticsearch unassigned shards (instance {{ $labels.instance }})
          description: "Elasticsearch has unassigned shards\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
Restart Prometheus service
Restart Prometheus to load the new configuration and alerting rules for Elasticsearch monitoring.
# Validate the configuration and the rule files it references first.
# Restarting with an invalid config would leave Prometheus down.
# Adjust the path if your configuration lives somewhere other than /etc/prometheus.
sudo promtool check config /etc/prometheus/prometheus.yml
sudo systemctl restart prometheus
sudo systemctl status prometheus
Import Elasticsearch Grafana dashboard
Import a pre-built Elasticsearch dashboard to visualize cluster metrics, or create a custom dashboard with essential monitoring panels.
# Create a minimal dashboard with a single "Cluster Status" stat panel.
# The exporter publishes cluster health as one 0/1 series per color label.
# The query folds those series into a single number per cluster
# (0 = green, 1 = yellow, 2 = red), which the value mappings translate.
curl -X POST http://admin:admin@localhost:3000/api/dashboards/db \
  -H "Content-Type: application/json" \
  -d '{
  "dashboard": {
    "id": null,
    "title": "Elasticsearch Cluster Monitoring",
    "tags": ["elasticsearch"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Cluster Status",
        "type": "stat",
        "targets": [
          {
            "expr": "sum by (cluster) (elasticsearch_cluster_health_status{color=\"yellow\"}) + 2 * sum by (cluster) (elasticsearch_cluster_health_status{color=\"red\"})",
            "legendFormat": "{{cluster}}",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "mappings": [
              {
                "options": {
                  "0": {
                    "text": "Green",
                    "color": "green"
                  },
                  "1": {
                    "text": "Yellow",
                    "color": "yellow"
                  },
                  "2": {
                    "text": "Red",
                    "color": "red"
                  }
                },
                "type": "value"
              }
            ]
          }
        },
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 0
        }
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "timepicker": {},
    "templating": {
      "list": []
    },
    "annotations": {
      "list": []
    },
    "refresh": "30s",
    "schemaVersion": 16,
    "version": 0,
    "links": []
  }
}'
Configure Grafana data source
Add Prometheus as a data source in Grafana if not already configured, pointing to your Prometheus instance.
# Register the local Prometheus server as Grafana's default data source.
curl --request POST \
  --user admin:admin \
  --header "Content-Type: application/json" \
  --data '{
    "name": "Prometheus",
    "type": "prometheus",
    "url": "http://localhost:9090",
    "access": "proxy",
    "isDefault": true
  }' \
  http://localhost:3000/api/datasources
Create comprehensive monitoring dashboard
Set up a detailed dashboard with panels for cluster health, node status, indexing performance, search latency, and resource utilization.
# Write the dashboard definition to the file that the curl call below uploads.
# "overwrite": true lets the upload replace an existing dashboard with the
# same title instead of being rejected.
cat > /tmp/elasticsearch-dashboard.json << 'EOF'
{
  "dashboard": {
    "id": null,
    "title": "Elasticsearch Cluster Monitoring",
    "tags": ["elasticsearch", "monitoring"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Cluster Health Status",
        "type": "stat",
        "targets": [{
          "expr": "elasticsearch_cluster_health_status",
          "legendFormat": "{{color}}"
        }],
        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0}
      },
      {
        "id": 2,
        "title": "Number of Nodes",
        "type": "stat",
        "targets": [{
          "expr": "elasticsearch_cluster_health_number_of_nodes",
          "legendFormat": "Nodes"
        }],
        "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0}
      },
      {
        "id": 3,
        "title": "JVM Heap Usage",
        "type": "graph",
        "targets": [{
          "expr": "elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"} * 100",
          "legendFormat": "{{instance}} Heap Usage %"
        }],
        "yaxes": [
          {"format": "percent", "max": 100, "show": true},
          {"format": "short", "show": false}
        ],
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 4}
      },
      {
        "id": 4,
        "title": "Indexing Rate",
        "type": "graph",
        "targets": [{
          "expr": "rate(elasticsearch_indices_indexing_index_total[5m])",
          "legendFormat": "{{instance}} Docs/sec"
        }],
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 4}
      },
      {
        "id": 5,
        "title": "Search Rate",
        "type": "graph",
        "targets": [{
          "expr": "rate(elasticsearch_indices_search_query_total[5m])",
          "legendFormat": "{{instance}} Queries/sec"
        }],
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 12}
      },
      {
        "id": 6,
        "title": "Disk Usage",
        "type": "graph",
        "targets": [{
          "expr": "(elasticsearch_filesystem_data_size_bytes - elasticsearch_filesystem_data_available_bytes) / elasticsearch_filesystem_data_size_bytes * 100",
          "legendFormat": "{{instance}} Disk Usage %"
        }],
        "yaxes": [
          {"format": "percent", "max": 100, "show": true},
          {"format": "short", "show": false}
        ],
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 12}
      }
    ],
    "time": {"from": "now-1h", "to": "now"},
    "refresh": "30s",
    "schemaVersion": 27,
    "version": 1
  },
  "overwrite": true
}
EOF
# Upload the saved definition to Grafana.
curl -X POST http://admin:admin@localhost:3000/api/dashboards/db \
  -H "Content-Type: application/json" \
  -d @/tmp/elasticsearch-dashboard.json
Configure alerting notifications
Set up notification channels in Grafana to receive alerts via email, Slack, or other preferred methods when Elasticsearch issues are detected.
# Create an email notification channel named "email-alerts".
# NOTE(review): /api/alert-notifications appears to belong to Grafana's legacy
# alerting; newer releases use contact points instead. Confirm against the
# Grafana version you run.
curl --request POST \
  --user admin:admin \
  --header "Content-Type: application/json" \
  --data '{
    "name": "email-alerts",
    "type": "email",
    "settings": {
      "addresses": "admin@example.com",
      "subject": "Elasticsearch Alert"
    }
  }' \
  http://localhost:3000/api/alert-notifications
Verify your setup
Check that all components are running correctly and collecting Elasticsearch metrics.
# 1. The exporter service should be active.
sudo systemctl status elasticsearch_exporter
# 2. The exporter should expose cluster health metrics.
#    -s hides curl's progress meter, which otherwise mixes into the output.
curl -s http://localhost:9114/metrics | grep elasticsearch_cluster_health
# 3. Prometheus should list the elasticsearch job among its scrape targets.
curl -s http://localhost:9090/api/v1/targets | grep elasticsearch
# 4. Grafana should return the configured data sources.
curl -s http://admin:admin@localhost:3000/api/datasources
Configure advanced monitoring features
Enable cluster-level metrics collection
Configure additional metrics collection for cluster statistics, shard allocation, and index-level performance data.
# /etc/systemd/system/elasticsearch_exporter.service
# Same unit as the basic setup, with the index-mapping and cluster-settings
# collectors enabled and an explicit log level.
[Unit]
Description=Elasticsearch Exporter
After=network.target

[Service]
Type=simple
User=elasticsearch_exporter
Group=elasticsearch_exporter
Restart=always
RestartSec=3
ExecStart=/usr/local/bin/elasticsearch_exporter \
    --es.uri=http://localhost:9200 \
    --es.all \
    --es.indices \
    --es.indices_settings \
    --es.indices_mappings \
    --es.shards \
    --es.snapshots \
    --es.cluster_settings \
    --es.timeout=30s \
    --web.listen-address=:9114 \
    --web.telemetry-path=/metrics \
    --log.level=info

[Install]
WantedBy=multi-user.target
# Re-read the edited unit file, then restart the exporter so the new flags apply.
sudo systemctl daemon-reload
sudo systemctl restart elasticsearch_exporter.service
Set up index-level monitoring
Create specific monitoring rules for critical indices to track their performance, size, and health separately.
# Per-index alerting rules.
# NOTE(review): the per-index metric names below assume the exporter runs with
# --es.indices; confirm them against your exporter's /metrics output.
groups:
  - name: elasticsearch_indices
    rules:
      # Store size is a gauge. increase() would treat every drop (for example
      # after a segment merge) as a counter reset and overstate growth, so
      # delta() is used to compare the size now with the size an hour ago.
      - alert: ElasticsearchIndexSizeGrowth
        expr: delta(elasticsearch_indices_store_size_bytes_total[1h]) > 1073741824  # 1GB growth per hour
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Elasticsearch index growing rapidly (instance {{ $labels.instance }})
          description: "Index {{ $labels.index }} is growing by more than 1GB per hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Document count is a gauge and rate() never returns a negative value,
      # so a rate-based comparison against -1000 could never fire.
      # delta() reports the net change: more than 1000 documents lost in 5 minutes.
      - alert: ElasticsearchIndexDocCountDrop
        expr: delta(elasticsearch_indices_docs_total[5m]) < -1000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Elasticsearch index document count dropping (instance {{ $labels.instance }})
          description: "Index {{ $labels.index }} is losing documents\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # No indexing operations recorded for the index during the last hour.
      # Subtracting a cumulative flush duration from time() is not an age and
      # would always exceed the threshold.
      # Add an index=~"..." matcher to limit this to indices that are expected
      # to receive writes; read-only or rolled-over indices match otherwise.
      - alert: ElasticsearchIndexNotUpdated
        expr: increase(elasticsearch_index_stats_indexing_index_total[1h]) == 0  # No updates for 1 hour
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Elasticsearch index not updated (instance {{ $labels.instance }})
          description: "Index {{ $labels.index }} has not been updated for over 1 hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Exporter fails to connect | Elasticsearch not accessible | Check Elasticsearch is running: curl http://localhost:9200 |
| No metrics in Prometheus | Exporter not being scraped | Verify the scrape targets: curl localhost:9090/api/v1/targets |
| Dashboard shows no data | Wrong data source or queries | Test queries in Prometheus UI first |
| Alerts not firing | Alert rules syntax error | Check Prometheus logs: journalctl -u prometheus |
| Permission denied errors | Incorrect service user setup | Check user exists: id elasticsearch_exporter |
| High memory usage | Too many metrics being collected | Disable unnecessary flags like --es.indices_mappings |
| SSL connection failures | HTTPS Elasticsearch without TLS config | Use --es.uri=https://localhost:9200 --es.ca=/path/to/ca.crt |
For Elasticsearch clusters with authentication enabled, pass credentials with --es.username=monitor --es.password=secret or use environment variables for sensitive data. For clusters requiring more sophisticated monitoring, consider implementing Prometheus federation to aggregate metrics from multiple Elasticsearch clusters.
Next steps
- Set up ClickHouse monitoring with Prometheus and Grafana dashboards
- Configure Elasticsearch cluster backup automation with snapshots
- Implement Elasticsearch security hardening with TLS and authentication
- Set up Elasticsearch log aggregation with Filebeat and Logstash
- Configure Elasticsearch cluster auto-scaling with Kubernetes
Automated install script
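Run this to automate the exporter installation (packages, binary, service user, systemd unit, and firewall rule). The Prometheus scrape job, alerting rules, and Grafana dashboards still need to be configured as described above.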
#!/usr/bin/env bash
#
# Installs the Prometheus Elasticsearch exporter as a systemd service.
#
# Usage: install.sh [elasticsearch_uri] [prometheus_config_path]
#   elasticsearch_uri       Elasticsearch endpoint the exporter queries
#                           (default: http://localhost:9200)
#   prometheus_config_path  Prometheus config file named in the closing hint
#                           (default: /etc/prometheus/prometheus.yml)
# Environment:
#   EXPORTER_VERSION        Exporter release to install (default: 1.7.0)
#
# Must be run as root. On any failure the ERR trap removes what was installed.

# -E makes the ERR trap fire for failures inside functions as well.
set -Eeuo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Default values
ES_URI="${1:-http://localhost:9200}"
PROMETHEUS_CONFIG="${2:-/etc/prometheus/prometheus.yml}"
EXPORTER_VERSION="${EXPORTER_VERSION:-1.7.0}"
EXPORTER_PKG="elasticsearch_exporter-${EXPORTER_VERSION}.linux-amd64"

# State consulted by cleanup()
WORK_DIR=""
USER_CREATED=0

# Usage message
usage() {
    echo "Usage: $0 [elasticsearch_uri] [prometheus_config_path]"
    echo "Example: $0 http://localhost:9200 /etc/prometheus/prometheus.yml"
    exit 1
}

# Show usage when help is requested or too many arguments are given.
case "${1:-}" in
    -h|--help) usage ;;
esac
[[ $# -le 2 ]] || usage

# Logging functions
log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }

# Cleanup on failure: undo everything this script may have installed.
cleanup() {
    # Disarm the trap so a failure during cleanup cannot re-enter it.
    trap - ERR
    log_error "Installation failed. Cleaning up..."
    systemctl stop elasticsearch_exporter 2>/dev/null || true
    systemctl disable elasticsearch_exporter 2>/dev/null || true
    rm -f /etc/systemd/system/elasticsearch_exporter.service
    # Only remove the account if this run created it.
    if [[ "$USER_CREATED" -eq 1 ]]; then
        userdel elasticsearch_exporter 2>/dev/null || true
    fi
    rm -f /usr/local/bin/elasticsearch_exporter
    if [[ -n "$WORK_DIR" ]]; then
        rm -rf "$WORK_DIR"
    fi
    exit 1
}
trap cleanup ERR

# Check if running as root or with sudo
if [[ $EUID -ne 0 ]]; then
    log_error "This script must be run as root or with sudo"
    exit 1
fi

# Auto-detect distribution
if [ -f /etc/os-release ]; then
    . /etc/os-release
    case "${ID:-unknown}" in
        ubuntu|debian)
            PKG_MGR="apt"
            ;;
        almalinux|rocky|centos|rhel|ol|fedora)
            PKG_MGR="dnf"
            ;;
        amzn)
            PKG_MGR="yum"
            ;;
        *)
            log_error "Unsupported distribution: ${ID:-unknown}"
            exit 1
            ;;
    esac
else
    log_error "Cannot detect distribution. /etc/os-release not found."
    exit 1
fi

# VERSION_ID is optional in os-release, so default it to stay safe under `set -u`.
log_info "Detected distribution: ${ID} (${VERSION_ID:-unknown})"

# Package-manager wrappers. These are functions rather than command strings:
# expanding a variable that contains "&&" passes it to the command as a
# literal argument instead of chaining two commands.
pkg_update() {
    case "$PKG_MGR" in
        apt)
            apt update
            apt upgrade -y
            ;;
        dnf) dnf update -y ;;
        yum) yum update -y ;;
    esac
}

pkg_install() {
    case "$PKG_MGR" in
        apt) apt install -y "$@" ;;
        dnf) dnf install -y "$@" ;;
        yum) yum install -y "$@" ;;
    esac
}

# Step 1: Update system packages
echo -e "${GREEN}[1/7]${NC} Updating system packages..."
pkg_update

# Step 2: Install prerequisites
echo -e "${GREEN}[2/7]${NC} Installing prerequisites..."
pkg_install wget tar

# Step 3: Download and install Elasticsearch exporter
echo -e "${GREEN}[3/7]${NC} Installing Elasticsearch exporter..."
# Use a private temporary directory instead of a predictable path in /tmp.
WORK_DIR="$(mktemp -d)"
cd "$WORK_DIR"
wget -q "https://github.com/prometheus-community/elasticsearch_exporter/releases/download/v${EXPORTER_VERSION}/${EXPORTER_PKG}.tar.gz"
tar -xzf "${EXPORTER_PKG}.tar.gz"
install -o root -g root -m 755 "${EXPORTER_PKG}/elasticsearch_exporter" /usr/local/bin/
cd /
rm -rf "$WORK_DIR"
WORK_DIR=""

# Step 4: Create service user
echo -e "${GREEN}[4/7]${NC} Creating elasticsearch_exporter user..."
if ! id elasticsearch_exporter &>/dev/null; then
    useradd --system --no-create-home --shell /bin/false elasticsearch_exporter
    USER_CREATED=1
fi

# Step 5: Create systemd service
echo -e "${GREEN}[5/7]${NC} Creating systemd service..."
# systemd expands "%" specifiers and "$" variables in ExecStart, so double
# them to keep the URI literal.
ES_URI_UNIT="$(printf '%s' "$ES_URI" | sed -e 's/%/%%/g' -e 's/\$/$$/g')"
# The heredoc is unquoted so that ${ES_URI_UNIT} is expanded; "\\" emits the
# literal backslash that continues the ExecStart line in the unit file.
# No ReadWritePaths= entry: ProtectSystem=strict fails at start-up when such a
# path does not exist, and this script creates no writable directory.
cat > /etc/systemd/system/elasticsearch_exporter.service << EOF
[Unit]
Description=Elasticsearch Exporter
After=network.target

[Service]
Type=simple
User=elasticsearch_exporter
Group=elasticsearch_exporter
ExecStart=/usr/local/bin/elasticsearch_exporter \\
    --es.uri=${ES_URI_UNIT} \\
    --es.all \\
    --es.indices \\
    --es.indices_settings \\
    --es.shards \\
    --es.snapshots \\
    --es.timeout=30s \\
    --web.listen-address=:9114 \\
    --web.telemetry-path=/metrics
Restart=always
RestartSec=3
NoNewPrivileges=yes
ProtectHome=yes
ProtectSystem=strict

[Install]
WantedBy=multi-user.target
EOF
chmod 644 /etc/systemd/system/elasticsearch_exporter.service

# Step 6: Configure firewall
echo -e "${GREEN}[6/7]${NC} Configuring firewall..."
case "$PKG_MGR" in
    apt)
        # Capture the status instead of piping into `grep -q`, whose early exit
        # can make the pipeline fail under pipefail.
        if command -v ufw &> /dev/null && [[ "$(ufw status)" == *"Status: active"* ]]; then
            ufw allow 9114/tcp comment "Elasticsearch Exporter"
        fi
        ;;
    dnf|yum)
        if systemctl is-active --quiet firewalld; then
            firewall-cmd --permanent --add-port=9114/tcp
            firewall-cmd --reload
        fi
        ;;
esac

# Step 7: Start and enable service
echo -e "${GREEN}[7/7]${NC} Starting elasticsearch_exporter service..."
systemctl daemon-reload
systemctl enable elasticsearch_exporter
systemctl start elasticsearch_exporter

# Wait for service to start
sleep 3

# Verification checks
echo -e "${GREEN}[VERIFY]${NC} Running verification checks..."

# Check if service is running
if systemctl is-active --quiet elasticsearch_exporter; then
    log_info "✓ Elasticsearch exporter service is running"
else
    log_error "✗ Elasticsearch exporter service is not running"
    systemctl status elasticsearch_exporter --no-pager
    exit 1
fi

# Check if metrics endpoint is accessible. wget is used because it is
# installed as a prerequisite above, whereas curl may not be present.
if wget -q -O /dev/null http://localhost:9114/metrics; then
    log_info "✓ Metrics endpoint is accessible"
else
    log_warn "✗ Metrics endpoint is not accessible (Elasticsearch may not be running)"
fi

# Display Prometheus configuration snippet
echo ""
log_info "Add this job to your Prometheus configuration (${PROMETHEUS_CONFIG}):"
cat << 'EOF'

scrape_configs:
  - job_name: 'elasticsearch'
    static_configs:
      - targets: ['localhost:9114']
    scrape_interval: 15s
    metrics_path: /metrics

EOF

# Display alerting rules
echo ""
log_info "Sample alerting rules for Elasticsearch:"
echo "Create /etc/prometheus/rules/elasticsearch_alerts.yml with critical monitoring rules"
echo ""
log_info "Installation completed successfully!"
log_info "Elasticsearch exporter is running on port 9114"
log_info "Metrics available at: http://localhost:9114/metrics"
log_info "Service logs: journalctl -u elasticsearch_exporter -f"
Review the script before running. It must run as root, so execute it with: sudo bash install.sh