Set up comprehensive monitoring and alerting for Jaeger distributed tracing using Prometheus metrics collection and Grafana dashboards with automated alerts for trace performance issues.
Prerequisites
- Jaeger already installed and running
- Root or sudo access
- At least 4GB RAM available
What this solves
Jaeger provides distributed tracing for microservices, but without proper alerting you won't know when trace collection fails or performance degrades. This tutorial integrates Jaeger with Prometheus for metrics collection and Grafana for visualization and alerting, giving you complete observability into your distributed tracing infrastructure.
Step-by-step configuration
Install Prometheus
First, install Prometheus to collect metrics from Jaeger components.
# Refresh the package index, then download and unpack the Prometheus 2.47.2 release.
sudo apt update
wget https://github.com/prometheus/prometheus/releases/download/v2.47.2/prometheus-2.47.2.linux-amd64.tar.gz
tar -xvzf prometheus-*.tar.gz
# Copy the server binary, promtool and the console assets into /opt/prometheus.
sudo mkdir -p /opt/prometheus
sudo cp prometheus-*/prometheus prometheus-*/promtool /opt/prometheus/
sudo cp -r prometheus-*/consoles prometheus-*/console_libraries /opt/prometheus/
Create Prometheus user and directories
Set up dedicated user and directories for Prometheus with proper permissions.
# Dedicated account with no home directory and no login shell.
sudo useradd --no-create-home --shell /bin/false prometheus
# Configuration directory and TSDB data directory.
sudo mkdir -p /etc/prometheus /var/lib/prometheus
# The two binaries are handed over individually; the asset, config and data
# trees are handed over recursively.
sudo chown prometheus:prometheus /opt/prometheus/prometheus /opt/prometheus/promtool
sudo chown -R prometheus:prometheus \
  /opt/prometheus/consoles \
  /opt/prometheus/console_libraries \
  /etc/prometheus \
  /var/lib/prometheus
Configure Prometheus for Jaeger metrics
Create Prometheus configuration to scrape metrics from Jaeger components including query, collector, and agent services.
# /etc/prometheus/prometheus.yml
# Scrapes Prometheus itself, the Jaeger query/collector/agent metrics
# endpoints and node-exporter, and forwards alerts to the local Alertmanager.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Resolved relative to this file's directory.
rule_files:
  - "jaeger_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'localhost:9093'

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'jaeger-query'
    metrics_path: /metrics
    scrape_interval: 15s
    static_configs:
      - targets: ['localhost:16687']

  - job_name: 'jaeger-collector'
    metrics_path: /metrics
    scrape_interval: 15s
    static_configs:
      - targets: ['localhost:14269']

  - job_name: 'jaeger-agent'
    metrics_path: /metrics
    scrape_interval: 15s
    static_configs:
      - targets: ['localhost:14271']

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']
Create Jaeger alerting rules
Define Prometheus alerting rules for Jaeger performance and health monitoring.
# /etc/prometheus/jaeger_rules.yml
# Availability, drop-rate, latency and storage alerts for Jaeger.
groups:
  - name: jaeger_alerts
    rules:
      - alert: JaegerCollectorDown
        expr: up{job="jaeger-collector"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Jaeger collector is down"
          description: "Jaeger collector has been down for more than 1 minute"

      - alert: JaegerQueryDown
        expr: up{job="jaeger-query"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Jaeger query service is down"
          description: "Jaeger query service has been down for more than 1 minute"

      - alert: JaegerHighSpanDropRate
        expr: rate(jaeger_collector_spans_dropped_total[5m]) > 100
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High span drop rate in Jaeger collector"
          description: "Jaeger collector is dropping {{ $value }} spans per second"

      # Jaeger records its latency histograms in seconds, so a 1000 ms
      # threshold is written as 1.
      - alert: JaegerHighTraceLatency
        expr: histogram_quantile(0.95, rate(jaeger_query_latency_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High trace query latency"
          description: "95th percentile query latency is {{ $value }}s"

      - alert: JaegerStorageErrors
        expr: rate(jaeger_collector_saves_failures_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Storage save failures in Jaeger"
          description: "Jaeger collector has {{ $value }} storage save failures per second"
Install and configure Alertmanager
Set up Alertmanager to handle Prometheus alerts and send notifications to email and Slack.
# Download and unpack the Alertmanager 0.26.0 release.
wget https://github.com/prometheus/alertmanager/releases/download/v0.26.0/alertmanager-0.26.0.linux-amd64.tar.gz
tar -xvzf alertmanager-*.tar.gz
# Install alertmanager and amtool next to the Prometheus binaries and hand
# them to the prometheus account.
sudo cp alertmanager-*/alertmanager alertmanager-*/amtool /opt/prometheus/
sudo chown prometheus:prometheus /opt/prometheus/alertmanager /opt/prometheus/amtool
Configure Alertmanager notifications
Create Alertmanager configuration for email and Slack notifications with routing based on alert severity.
# /etc/prometheus/alertmanager.yml
# Routes alerts by severity: critical -> email + Slack, warning -> Slack,
# everything else -> email.
global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  # Placeholder: replace before use and keep this file out of version control.
  smtp_auth_password: 'your-smtp-password'
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  receiver: 'default-receiver'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        severity: warning
      receiver: 'warning-alerts'

receivers:
  - name: 'default-receiver'
    email_configs:
      - to: 'ops-team@example.com'
        # email_configs has no `subject` or `body` key (unknown keys make
        # Alertmanager reject the file): the subject is set as a header and
        # the plain-text body is `text`.
        headers:
          Subject: 'Jaeger Alert: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}

  - name: 'critical-alerts'
    email_configs:
      - to: 'ops-team@example.com'
        headers:
          Subject: 'CRITICAL: Jaeger Alert - {{ .GroupLabels.alertname }}'
    slack_configs:
      - channel: '#alerts'
        title: 'Critical Jaeger Alert'
        text: '{{ .CommonAnnotations.summary }}'

  - name: 'warning-alerts'
    slack_configs:
      - channel: '#monitoring'
        title: 'Jaeger Warning'
        text: '{{ .CommonAnnotations.summary }}'
Create systemd services
Set up systemd services for Prometheus and Alertmanager to run automatically on boot.
# /etc/systemd/system/prometheus.service
# Runs Prometheus as the unprivileged prometheus user once the network is up.
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target

[Service]
User=prometheus
Group=prometheus
Type=simple
# Bring the server back automatically if it exits abnormally.
Restart=on-failure
RestartSec=5s
# NOTE(review): --web.enable-lifecycle enables the HTTP reload/shutdown
# endpoints on the listen address below; restrict who can reach port 9090.
ExecStart=/opt/prometheus/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/var/lib/prometheus/ \
  --web.console.templates=/opt/prometheus/consoles \
  --web.console.libraries=/opt/prometheus/console_libraries \
  --web.listen-address=0.0.0.0:9090 \
  --web.enable-lifecycle

[Install]
WantedBy=multi-user.target
Create Alertmanager systemd service
Configure Alertmanager as a systemd service for automatic startup and management.
# alertmanager.service (systemd unit, e.g. /etc/systemd/system/alertmanager.service)
# Runs Alertmanager as the prometheus user once the network is up.
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target

[Service]
User=prometheus
Group=prometheus
Type=simple
# Bring the service back automatically if it exits abnormally.
Restart=on-failure
RestartSec=5s
ExecStart=/opt/prometheus/alertmanager \
  --config.file=/etc/prometheus/alertmanager.yml \
  --storage.path=/var/lib/alertmanager/ \
  --web.listen-address=0.0.0.0:9093

[Install]
WantedBy=multi-user.target
Install Grafana
Install Grafana for creating dashboards and visualizing Jaeger metrics.
# apt-key is deprecated; store the Grafana signing key in a dedicated keyring
# and reference it from the repository entry with signed-by.
sudo apt install -y apt-transport-https gnupg wget
sudo mkdir -p /etc/apt/keyrings
wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor | sudo tee /etc/apt/keyrings/grafana.gpg > /dev/null
echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" | sudo tee /etc/apt/sources.list.d/grafana.list
sudo apt update
sudo apt install -y grafana
Configure Grafana data source
Add Prometheus as a data source in Grafana for querying Jaeger metrics.
# Provision Prometheus as Grafana's default data source. The quoted 'EOF'
# keeps the shell from expanding anything inside the YAML.
sudo mkdir -p /etc/grafana/provisioning/datasources
sudo tee /etc/grafana/provisioning/datasources/prometheus.yml > /dev/null <<'EOF'
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://localhost:9090
    isDefault: true
EOF
Create Jaeger monitoring dashboard
Set up a comprehensive Grafana dashboard for monitoring Jaeger performance metrics.
{
  "dashboard": {
    "id": null,
    "title": "Jaeger Distributed Tracing",
    "tags": ["jaeger", "tracing"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Jaeger Services Status",
        "type": "stat",
        "targets": [
          {
            "expr": "up{job=~\"jaeger.*\"}",
            "legendFormat": "{{ job }}"
          }
        ],
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
      },
      {
        "id": 2,
        "title": "Spans Received Rate",
        "type": "timeseries",
        "targets": [
          {
            "expr": "rate(jaeger_collector_spans_received_total[5m])",
            "legendFormat": "Spans/sec"
          }
        ],
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
      },
      {
        "id": 3,
        "title": "Query Latency",
        "type": "timeseries",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(jaeger_query_latency_bucket[5m]))",
            "legendFormat": "95th percentile"
          },
          {
            "expr": "histogram_quantile(0.50, rate(jaeger_query_latency_bucket[5m]))",
            "legendFormat": "50th percentile"
          }
        ],
        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 8}
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "30s"
  }
}
Start all services
Enable and start Prometheus, Alertmanager, and Grafana services.
# Alertmanager's state directory must exist before the service starts.
sudo mkdir -p /var/lib/alertmanager
# Make sure every config and data tree belongs to the service account.
sudo chown -R prometheus:prometheus /etc/prometheus /var/lib/prometheus /var/lib/alertmanager
# Pick up the new unit files, then enable at boot and start in one step.
sudo systemctl daemon-reload
sudo systemctl enable --now prometheus alertmanager grafana-server
Configure firewall rules
Open necessary ports for Prometheus, Alertmanager, and Grafana web interfaces.
# Open the Prometheus (9090), Alertmanager (9093) and Grafana (3000) web ports.
# NOTE(review): these rules accept connections from any source address, and
# this guide configures no authentication for Prometheus or Alertmanager -
# confirm that exposure is intended.
sudo ufw allow 9090/tcp comment "Prometheus"
sudo ufw allow 9093/tcp comment "Alertmanager"
sudo ufw allow 3000/tcp comment "Grafana"
# Reload ufw after adding the rules.
sudo ufw reload
Verify your setup
Check that all services are running and accessible.
# Service state as seen by systemd (--no-pager so the command returns
# instead of opening a pager).
sudo systemctl status --no-pager prometheus alertmanager grafana-server
# Prometheus liveness, then scrape-target health as JSON. The plain /targets
# URL only serves the web UI page, which is not useful from curl.
curl -s http://localhost:9090/-/healthy
curl -s http://localhost:9090/api/v1/targets
# Alertmanager and Grafana health endpoints.
curl -s http://localhost:9093/-/healthy
curl -s http://localhost:3000/api/health
Configure advanced alerting rules
Add more sophisticated alerting rules for comprehensive Jaeger monitoring.
# Additional Jaeger alert groups (resource usage and ingestion volume).
# The prometheus.yml in this guide only loads "jaeger_rules.yml" through
# rule_files; if these groups are saved to a different file, list that file
# under rule_files as well.
groups:
  - name: jaeger_performance
    rules:
      - alert: JaegerMemoryUsageHigh
        expr: (process_resident_memory_bytes{job=~"jaeger.*"} / 1024 / 1024) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage in {{ $labels.job }}"
          description: "Memory usage is {{ $value }}MB in {{ $labels.job }}"

      # rate() of CPU seconds is a fraction of one core; multiply by 100 for percent.
      - alert: JaegerCPUUsageHigh
        expr: rate(process_cpu_seconds_total{job=~"jaeger.*"}[5m]) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage in {{ $labels.job }}"
          description: "CPU usage is {{ $value }}% in {{ $labels.job }}"

      - alert: JaegerStorageSpaceUsage
        expr: (100 - (node_filesystem_avail_bytes{mountpoint="/"} * 100) / node_filesystem_size_bytes{mountpoint="/"}) > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space for Jaeger storage"
          description: "Disk usage is above 85% on {{ $labels.instance }}"

  - name: jaeger_business_metrics
    rules:
      - alert: JaegerLowTraceIngestionRate
        expr: rate(jaeger_collector_spans_received_total[10m]) < 100
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low trace ingestion rate"
          description: "Receiving only {{ $value }} spans per second for 10 minutes"
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Prometheus can't scrape Jaeger metrics | Jaeger metrics endpoints not exposed | Start Jaeger with --metrics-backend=prometheus |
| Alertmanager not sending notifications | SMTP/Slack configuration incorrect | Check the service log with journalctl -u alertmanager and validate the config with amtool check-config /etc/prometheus/alertmanager.yml |
| Grafana dashboard shows no data | Prometheus data source misconfigured | Verify Prometheus URL in Grafana data sources |
| High memory usage in Prometheus | Too many metrics or long retention | Adjust --storage.tsdb.retention.time and scrape intervals |
| Alerts not firing | Alerting rules syntax errors | Validate rules with promtool check rules /etc/prometheus/jaeger_rules.yml |
Next steps
- Integrate Jaeger with Istio service mesh for comprehensive microservices tracing
- Set up advanced Grafana dashboards with custom panels and alerting rules
- Configure Jaeger sampling strategies for high-traffic applications
- Set up Jaeger multi-cluster federation for distributed environments
Running this in production?
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
# Installs Prometheus and configures it to scrape and alert on Jaeger.
# Abort on command failure, on unset variables and on failures inside pipelines.
set -o errexit -o nounset -o pipefail

# ANSI colour sequences; they are interpreted later by `echo -e`.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'

# Release to install, the service account, and the filesystem layout.
PROMETHEUS_VERSION="2.47.2"
PROMETHEUS_USER="prometheus"
PROMETHEUS_HOME="/opt/prometheus"
PROMETHEUS_CONFIG="/etc/prometheus"
PROMETHEUS_DATA="/var/lib/prometheus"
# Print colored output
print_status() {
  # Print an informational message ($1) behind a green [INFO] tag.
  local message=$1
  echo -e "${GREEN}[INFO]${NC} ${message}"
}
print_warning() {
  # Print a warning message ($1) behind a yellow [WARN] tag.
  # Diagnostics go to stderr so they are not mixed into captured stdout.
  echo -e "${YELLOW}[WARN]${NC} $1" >&2
}
print_error() {
  # Print an error message ($1) behind a red [ERROR] tag.
  # Errors go to stderr so they are not mixed into captured stdout.
  echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Usage message
usage() {
  # Show how to invoke the script and what it does, then exit with status 1.
  printf '%s\n' \
    "Usage: $0" \
    "Installs and configures Prometheus with Jaeger monitoring"
  exit 1
}
# Cleanup function for rollback
cleanup() {
  # ERR-trap handler: undo a partially completed installation.
  # Stops and unregisters the service, removes the install/config/data
  # directories and the service account, and deletes the downloaded release.
  local artefact="/tmp/prometheus-${PROMETHEUS_VERSION}.linux-amd64"
  print_error "Installation failed. Cleaning up..."
  systemctl stop prometheus 2>/dev/null || true
  systemctl disable prometheus 2>/dev/null || true
  rm -f /etc/systemd/system/prometheus.service
  systemctl daemon-reload 2>/dev/null || true
  # NOTE(review): this also removes directories that existed before the
  # script ran - confirm that is acceptable on hosts with an earlier install.
  rm -rf "$PROMETHEUS_HOME" "$PROMETHEUS_CONFIG" "$PROMETHEUS_DATA"
  userdel "$PROMETHEUS_USER" 2>/dev/null || true
  # Absolute paths on purpose: the trap can fire before `cd /tmp`, and a
  # relative `prometheus-*` glob would then delete unrelated files in the
  # directory the script was started from.
  rm -rf "$artefact"*
}
# Roll back through cleanup() whenever a command fails
trap cleanup ERR
# Refuse to continue without root privileges
if (( EUID != 0 )); then
  print_error "This script must be run as root or with sudo"
  exit 1
fi
# Detect the distribution and choose the matching package-manager commands.
# Sets PKG_MGR, PKG_UPDATE and PKG_INSTALL for the steps that follow.
echo "[1/8] Detecting Linux distribution..."
if [ ! -f /etc/os-release ]; then
  print_error "Cannot detect Linux distribution"
  exit 1
fi
. /etc/os-release
# ${ID:-} keeps `set -u` from aborting if os-release does not define ID.
case "${ID:-}" in
  ubuntu|debian)
    PKG_MGR="apt"
    PKG_UPDATE="apt update"
    PKG_INSTALL="apt install -y"
    ;;
  almalinux|rocky|centos|rhel|ol|fedora)
    PKG_MGR="dnf"
    # Refresh metadata only, matching `apt update`; `dnf update -y` would
    # upgrade every installed package as a side effect of this installer.
    PKG_UPDATE="dnf -y makecache"
    PKG_INSTALL="dnf install -y"
    ;;
  amzn)
    PKG_MGR="yum"
    PKG_UPDATE="yum -y makecache"
    PKG_INSTALL="yum install -y"
    ;;
  *)
    print_error "Unsupported distribution: ${ID:-unknown}"
    exit 1
    ;;
esac
print_status "Detected distribution: $ID using $PKG_MGR"
# Run the update command selected for this distribution.
# The variable is left unquoted on purpose so it splits into command + arguments.
echo "[2/8] Updating system packages..."
${PKG_UPDATE}
# Tools the remaining steps rely on: wget (download), tar (unpack), curl (health check).
echo "[3/8] Installing prerequisites..."
${PKG_INSTALL} wget tar curl
# Download the Prometheus release into /tmp, verify it, and unpack it.
echo "[4/8] Downloading and installing Prometheus..."
cd /tmp
release_url="https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}"
tarball="prometheus-${PROMETHEUS_VERSION}.linux-amd64.tar.gz"
checksums="prometheus-${PROMETHEUS_VERSION}.linux-amd64.sha256sums.txt"
# -O overwrites a leftover download; without it a re-run saves "<name>.1"
# and the stale tarball would be the one extracted.
wget -O "$tarball" "${release_url}/${tarball}"
wget -O "$checksums" "${release_url}/sha256sums.txt"
# Verify the tarball against the published SHA-256 before using it. A missing
# entry or a mismatch fails the pipeline and aborts the install.
grep -F -- "  ${tarball}" "$checksums" | sha256sum -c -
rm -f "$checksums"
tar xzf "$tarball"
# Create the service account and directories, install the binaries and
# console assets, and set ownership and permissions.
echo "[5/8] Creating Prometheus user and directories..."
# Create the account only when it is missing, so a genuine useradd failure
# still aborts the install instead of being swallowed.
if ! id -u "$PROMETHEUS_USER" >/dev/null 2>&1; then
  useradd --no-create-home --shell /bin/false "$PROMETHEUS_USER"
fi
mkdir -p "$PROMETHEUS_HOME" "$PROMETHEUS_CONFIG" "$PROMETHEUS_DATA"
# Install Prometheus binaries and console assets from the unpacked release
release_dir="prometheus-${PROMETHEUS_VERSION}.linux-amd64"
cp "$release_dir/prometheus" "$release_dir/promtool" "$PROMETHEUS_HOME/"
cp -r "$release_dir/consoles" "$release_dir/console_libraries" "$PROMETHEUS_HOME/"
# Set permissions
chown -R "$PROMETHEUS_USER:$PROMETHEUS_USER" "$PROMETHEUS_HOME" "$PROMETHEUS_CONFIG" "$PROMETHEUS_DATA"
chmod 755 "$PROMETHEUS_HOME/prometheus" "$PROMETHEUS_HOME/promtool"
# Create Prometheus configuration. The quoted 'EOF' writes the YAML verbatim;
# its indentation is significant and must be preserved.
echo "[6/8] Creating Prometheus configuration..."
cat > "$PROMETHEUS_CONFIG/prometheus.yml" << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "jaeger_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'localhost:9093'

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'jaeger-query'
    metrics_path: /metrics
    scrape_interval: 15s
    static_configs:
      - targets: ['localhost:16687']

  - job_name: 'jaeger-collector'
    metrics_path: /metrics
    scrape_interval: 15s
    static_configs:
      - targets: ['localhost:14269']

  - job_name: 'jaeger-agent'
    metrics_path: /metrics
    scrape_interval: 15s
    static_configs:
      - targets: ['localhost:14271']

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']
EOF
# Create Jaeger alerting rules. The quoted 'EOF' keeps {{ $value }} literal
# for Prometheus templating; the YAML indentation is significant.
cat > "$PROMETHEUS_CONFIG/jaeger_rules.yml" << 'EOF'
groups:
  - name: jaeger_alerts
    rules:
      - alert: JaegerCollectorDown
        expr: up{job="jaeger-collector"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Jaeger collector is down"
          description: "Jaeger collector has been down for more than 1 minute"

      - alert: JaegerQueryDown
        expr: up{job="jaeger-query"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Jaeger query service is down"
          description: "Jaeger query service has been down for more than 1 minute"

      - alert: JaegerHighSpanDropRate
        expr: rate(jaeger_collector_spans_dropped_total[5m]) > 100
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High span drop rate in Jaeger collector"
          description: "Jaeger collector is dropping {{ $value }} spans per second"

      # Jaeger records its latency histograms in seconds, so a 1000 ms
      # threshold is written as 1.
      - alert: JaegerHighTraceLatency
        expr: histogram_quantile(0.95, rate(jaeger_query_latency_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High trace query latency"
          description: "95th percentile trace query latency is {{ $value }}s"

      - alert: JaegerStorageErrors
        expr: rate(jaeger_collector_saves_failures_total[5m]) > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Storage save failures in Jaeger"
          description: "Jaeger collector has {{ $value }} storage save failures per second"
EOF
# Set configuration file ownership and permissions
chown "$PROMETHEUS_USER:$PROMETHEUS_USER" "$PROMETHEUS_CONFIG/prometheus.yml" "$PROMETHEUS_CONFIG/jaeger_rules.yml"
chmod 644 "$PROMETHEUS_CONFIG/prometheus.yml" "$PROMETHEUS_CONFIG/jaeger_rules.yml"
# Validate the configuration (and the rule files it references) before the
# service is started, so a malformed file fails here with a clear message.
"$PROMETHEUS_HOME/promtool" check config "$PROMETHEUS_CONFIG/prometheus.yml"
# Create systemd service. The heredoc is unquoted so the PROMETHEUS_*
# variables expand; each "\\" is written to the unit as a single "\"
# line continuation.
echo "[7/8] Creating systemd service..."
cat > /etc/systemd/system/prometheus.service << EOF
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target

[Service]
User=$PROMETHEUS_USER
Group=$PROMETHEUS_USER
Type=simple
# Bring the server back automatically if it exits abnormally.
Restart=on-failure
RestartSec=5s
ExecStart=$PROMETHEUS_HOME/prometheus \\
  --config.file=$PROMETHEUS_CONFIG/prometheus.yml \\
  --storage.tsdb.path=$PROMETHEUS_DATA/ \\
  --web.console.templates=$PROMETHEUS_HOME/consoles \\
  --web.console.libraries=$PROMETHEUS_HOME/console_libraries \\
  --web.listen-address=0.0.0.0:9090 \\
  --web.enable-lifecycle

[Install]
WantedBy=multi-user.target
EOF
# Register the new unit, then enable it at boot and start it in one step
systemctl daemon-reload
systemctl enable --now prometheus
# Open port 9090 on whichever supported firewall front end is installed;
# failures here are deliberately ignored (best effort)
if command -v ufw &>/dev/null; then
  ufw allow 9090/tcp 2>/dev/null || true
elif command -v firewall-cmd &>/dev/null; then
  firewall-cmd --permanent --add-port=9090/tcp 2>/dev/null || true
  firewall-cmd --reload 2>/dev/null || true
fi
# Remove the downloaded tarball and the unpacked release directory
rm -rf "/tmp/prometheus-${PROMETHEUS_VERSION}.linux-amd64"*
# Verify installation
echo "[8/8] Verifying installation..."
# Poll the health endpoint for up to ~15 s instead of one fixed sleep.
# curl -f makes HTTP error responses count as failures; plain `curl -s`
# exits 0 for any HTTP response.
healthy=false
for ((attempt = 1; attempt <= 15; attempt++)); do
  if curl -fs http://localhost:9090/-/healthy >/dev/null; then
    healthy=true
    break
  fi
  # Stop waiting early if systemd already reports the unit as not running.
  if ! systemctl is-active --quiet prometheus; then
    break
  fi
  sleep 1
done
if systemctl is-active --quiet prometheus; then
  print_status "Prometheus service is running"
else
  print_error "Prometheus service failed to start"
  exit 1
fi
if [[ "$healthy" == true ]]; then
  print_status "Prometheus health check passed"
else
  print_warning "Prometheus health check failed - service may still be starting"
fi
# The install succeeded: disarm the rollback handler
trap - ERR
print_status "Installation completed successfully!"
# Summary of where things live and what to do next
cat << EOF

Prometheus is now running and configured for Jaeger monitoring:
 - Web UI: http://$(hostname -I | awk '{print $1}'):9090
 - Configuration: $PROMETHEUS_CONFIG/prometheus.yml
 - Alert rules: $PROMETHEUS_CONFIG/jaeger_rules.yml
 - Data directory: $PROMETHEUS_DATA

Next steps:
 1. Install and configure Jaeger with metrics enabled
 2. Install Alertmanager for alert handling
 3. Install Grafana for visualization
 4. Configure dashboards for Jaeger metrics
EOF
Review the script before running. It must be run as root, so execute it with: sudo bash install.sh