Set up hierarchical Prometheus federation to monitor multiple Kubernetes clusters with a central aggregation layer. Configure global and local Prometheus instances with federated scrape jobs, service discovery, and unified dashboards for enterprise-scale observability.
Prerequisites
- Multiple servers or clusters to monitor
- Basic knowledge of Prometheus configuration
- Network connectivity between federation instances
- Understanding of PromQL for custom metrics queries
What this solves
Prometheus federation enables you to create a hierarchical monitoring architecture where multiple Prometheus servers collect metrics from different clusters or environments, while a global Prometheus server aggregates key metrics for centralized monitoring. This approach reduces network overhead, improves scalability, and provides both local and global views of your infrastructure.
Step-by-step configuration
Update system packages
Start by updating your package manager to ensure you get the latest versions of all components.
sudo apt update && sudo apt upgrade -y
sudo apt install -y wget curl tar
Create Prometheus user and directories
Create dedicated users and directory structure for both global and local Prometheus instances.
sudo useradd --no-create-home --shell /bin/false prometheus
sudo useradd --no-create-home --shell /bin/false prometheus-local
sudo mkdir -p /etc/prometheus/{global,local}
sudo mkdir -p /var/lib/prometheus/{global,local}
sudo mkdir -p /etc/systemd/system
sudo chown prometheus:prometheus /etc/prometheus/global /var/lib/prometheus/global
sudo chown prometheus-local:prometheus-local /etc/prometheus/local /var/lib/prometheus/local
Download and install Prometheus
Download the Prometheus 2.48.0 release (substitute a newer version number in the commands below if you prefer) and install the binaries, which are shared by the global and local instances.
cd /tmp
wget https://github.com/prometheus/prometheus/releases/download/v2.48.0/prometheus-2.48.0.linux-amd64.tar.gz
tar xzf prometheus-2.48.0.linux-amd64.tar.gz
sudo cp prometheus-2.48.0.linux-amd64/prometheus /usr/local/bin/
sudo cp prometheus-2.48.0.linux-amd64/promtool /usr/local/bin/
sudo cp -r prometheus-2.48.0.linux-amd64/consoles /etc/prometheus/
sudo cp -r prometheus-2.48.0.linux-amd64/console_libraries /etc/prometheus/
sudo chown -R prometheus:prometheus /etc/prometheus/consoles /etc/prometheus/console_libraries
sudo chown prometheus:prometheus /usr/local/bin/prometheus /usr/local/bin/promtool
Configure local Prometheus instance
Create the configuration for the local Prometheus server, which collects metrics from local services and is scraped by the global instance. Save it as /etc/prometheus/local/prometheus.yml.
global:
scrape_interval: 15s
evaluation_interval: 15s
external_labels:
cluster: 'local-cluster-1'
region: 'us-east-1'
environment: 'production'
rule_files:
- "local_rules.yml"
scrape_configs:
- job_name: 'prometheus-local'
static_configs:
- targets: ['localhost:9091']
labels:
instance: 'local-prometheus'
- job_name: 'node-exporter'
static_configs:
- targets: ['localhost:9100']
labels:
cluster: 'local-cluster-1'
- job_name: 'kubernetes-cluster-1'
kubernetes_sd_configs:
- role: pod
kubeconfig_file: '/etc/kubernetes/admin.conf'
relabel_configs:
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
action: keep
regex: true
- source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
action: replace
target_label: __metrics_path__
regex: (.+)
- job_name: 'application-metrics'
static_configs:
- targets: ['app1.example.com:8080', 'app2.example.com:8080']
labels:
environment: 'production'
cluster: 'local-cluster-1'
alerting:
alertmanagers:
- static_configs:
- targets:
- localhost:9093
Create local Prometheus alerting rules
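Define alerting rules for the local cluster. They are evaluated by the local instance, before any metrics are federated to the global instance. Save them as /etc/prometheus/local/local_rules.yml, the file referenced by rule_files in the local configuration.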
groups:
- name: local_cluster_alerts
rules:
- alert: HighCPUUsage
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
severity: warning
cluster: local-cluster-1
annotations:
summary: "High CPU usage on {{ $labels.instance }}"
description: "CPU usage is above 80% on {{ $labels.instance }} for more than 5 minutes."
- alert: MemoryUsageHigh
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
for: 5m
labels:
severity: critical
cluster: local-cluster-1
annotations:
summary: "High memory usage on {{ $labels.instance }}"
description: "Memory usage is above 90% on {{ $labels.instance }}."
- alert: DiskSpaceLow
expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 10
for: 5m
labels:
severity: critical
cluster: local-cluster-1
annotations:
summary: "Low disk space on {{ $labels.instance }}"
description: "Disk space is below 10% on {{ $labels.instance }}."
- alert: ServiceDown
expr: up == 0
for: 1m
labels:
severity: critical
cluster: local-cluster-1
annotations:
summary: "Service {{ $labels.job }} is down"
description: "Service {{ $labels.job }} on {{ $labels.instance }} has been down for more than 1 minute."
Configure global Prometheus instance
Create the configuration for the global Prometheus server, which federates metrics from multiple local Prometheus instances. Save it as /etc/prometheus/global/prometheus.yml.
global:
scrape_interval: 30s
evaluation_interval: 30s
external_labels:
monitor: 'global-prometheus'
datacenter: 'main'
rule_files:
- "global_rules.yml"
scrape_configs:
- job_name: 'prometheus-global'
static_configs:
- targets: ['localhost:9090']
- job_name: 'federated-clusters'
scrape_interval: 30s
honor_labels: true
metrics_path: '/federate'
params:
'match[]':
# Aggregate job-level metrics
- '{__name__=~"job:.*"}'
# Include important instance-level metrics
- '{__name__=~"up|node_cpu_seconds_total|node_memory_MemTotal_bytes|node_memory_MemAvailable_bytes|node_filesystem_size_bytes|node_filesystem_avail_bytes"}'
# Application-specific metrics
- '{__name__=~"http_requests_total|http_request_duration_seconds.*|application_.*"}'
# Kubernetes metrics
- '{__name__=~"kube_.*|container_.*"}'
# Custom business metrics
- '{__name__=~"business_.*|revenue_.*|user_.*"}'
static_configs:
- targets:
- 'prometheus-local-1.example.com:9091'
- 'prometheus-local-2.example.com:9091'
- 'prometheus-local-3.example.com:9091'
labels:
region: 'us-east-1'
- targets:
- 'prometheus-eu-1.example.com:9091'
- 'prometheus-eu-2.example.com:9091'
labels:
region: 'eu-west-1'
- targets:
- 'prometheus-asia-1.example.com:9091'
labels:
region: 'ap-southeast-1'
- job_name: 'cross-cluster-services'
static_configs:
- targets: ['gateway.example.com:9090']
labels:
service: 'api-gateway'
type: 'cross-cluster'
alerting:
alertmanagers:
- static_configs:
- targets:
- 'alertmanager-global:9093'
Create global aggregation rules
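Define recording and alerting rules for the global Prometheus instance that aggregate metrics across all federated clusters. Save them as /etc/prometheus/global/global_rules.yml, the file referenced by rule_files in the global configuration.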
groups:
- name: global_aggregation_rules
interval: 30s
rules:
# Cross-cluster CPU usage aggregation
- record: job:cpu_usage_percent:mean5m
expr: 100 - (avg by(job, cluster) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
- record: cluster:cpu_usage_percent:mean5m
expr: avg by(cluster) (job:cpu_usage_percent:mean5m)
- record: global:cpu_usage_percent:mean5m
expr: avg(cluster:cpu_usage_percent:mean5m)
# Cross-cluster memory usage aggregation
- record: job:memory_usage_percent:current
expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
- record: cluster:memory_usage_percent:current
expr: avg by(cluster) (job:memory_usage_percent:current)
# HTTP request rate aggregation
- record: job:http_requests:rate5m
expr: sum by(job, cluster) (rate(http_requests_total[5m]))
- record: cluster:http_requests:rate5m
expr: sum by(cluster) (job:http_requests:rate5m)
- record: global:http_requests:rate5m
expr: sum(cluster:http_requests:rate5m)
# Service availability aggregation
- record: job:up:avg
expr: avg by(job, cluster) (up)
- record: cluster:services_available:count
expr: count by(cluster) (job:up:avg > 0.8)
- name: global_alerts
rules:
- alert: ClusterDown
expr: up{job="federated-clusters"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Prometheus cluster {{ $labels.cluster }} is unreachable"
description: "Cannot scrape metrics from cluster {{ $labels.cluster }} in region {{ $labels.region }}."
- alert: GlobalHighCPUUsage
expr: global:cpu_usage_percent:mean5m > 75
for: 10m
labels:
severity: warning
annotations:
summary: "Global CPU usage is high"
description: "Average CPU usage across all clusters is {{ $value }}%."
- alert: ClusterHighErrorRate
expr: cluster:http_requests:rate5m > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "High request rate in cluster {{ $labels.cluster }}"
description: "Cluster {{ $labels.cluster }} is receiving {{ $value }} requests/sec."
- alert: ServiceUnavailableAcrossClusters
expr: count by(job) (job:up:avg < 0.5) > 2
for: 3m
labels:
severity: critical
annotations:
summary: "Service {{ $labels.job }} is failing across multiple clusters"
description: "Service {{ $labels.job }} has low availability in multiple clusters."
Create systemd service files
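Create separate systemd service files for the local and global Prometheus instances so that they can run concurrently on different ports. Save the unit below, for the local instance, as /etc/systemd/system/prometheus-local.service.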
[Unit]
Description=Prometheus Local Instance
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus-local
Group=prometheus-local
Type=simple
ExecStart=/usr/local/bin/prometheus \
--config.file /etc/prometheus/local/prometheus.yml \
--storage.tsdb.path /var/lib/prometheus/local/ \
--web.console.templates=/etc/prometheus/consoles \
--web.console.libraries=/etc/prometheus/console_libraries \
--web.listen-address=0.0.0.0:9091 \
--storage.tsdb.retention.time=7d \
--web.enable-lifecycle
Restart=always
RestartSec=3
[Install]
WantedBy=multi-user.target
Create global Prometheus service
Create the systemd service file for the global Prometheus instance that will aggregate federated metrics. Save it as /etc/systemd/system/prometheus-global.service.
[Unit]
Description=Prometheus Global Federation Instance
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
Type=simple
ExecStart=/usr/local/bin/prometheus \
--config.file /etc/prometheus/global/prometheus.yml \
--storage.tsdb.path /var/lib/prometheus/global/ \
--web.console.templates=/etc/prometheus/consoles \
--web.console.libraries=/etc/prometheus/console_libraries \
--web.listen-address=0.0.0.0:9090 \
--storage.tsdb.retention.time=90d \
--web.enable-lifecycle \
--web.enable-admin-api
Restart=always
RestartSec=3
[Install]
WantedBy=multi-user.target
Configure firewall rules
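Open the necessary ports for Prometheus federation and web interfaces. Note that the first two rules open the ports to any source address, which makes the source-restricted rules that follow redundant; to limit access to specific networks, apply only the two "allow from" rules and replace the example subnets with your own.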
sudo ufw allow 9090/tcp comment 'Prometheus Global'
sudo ufw allow 9091/tcp comment 'Prometheus Local'
sudo ufw allow from 203.0.113.0/24 to any port 9090 comment 'Global Prometheus Access'
sudo ufw allow from 198.51.100.0/24 to any port 9091 comment 'Federation Scraping'
sudo ufw reload
Set correct file permissions
Ensure all configuration files have the correct ownership and permissions for security.
sudo chown -R prometheus:prometheus /etc/prometheus/global/
sudo chown -R prometheus-local:prometheus-local /etc/prometheus/local/
sudo chmod 644 /etc/prometheus/global/*.yml
sudo chmod 644 /etc/prometheus/local/*.yml
sudo chmod 755 /var/lib/prometheus/global
sudo chmod 755 /var/lib/prometheus/local
Start and enable services
Start both Prometheus instances and enable them to start automatically on boot.
sudo systemctl daemon-reload
sudo systemctl enable prometheus-local prometheus-global
sudo systemctl start prometheus-local
sudo systemctl start prometheus-global
# Check service status
sudo systemctl status prometheus-local
sudo systemctl status prometheus-global
Install Node Exporter for system metrics
Install Node Exporter to provide system metrics that can be federated to the global instance.
cd /tmp
wget https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz
tar xzf node_exporter-1.7.0.linux-amd64.tar.gz
sudo cp node_exporter-1.7.0.linux-amd64/node_exporter /usr/local/bin/
sudo chown prometheus:prometheus /usr/local/bin/node_exporter
Create Node Exporter service
Create a systemd service for Node Exporter to collect system metrics automatically. Save it as /etc/systemd/system/node_exporter.service.
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
Type=simple
ExecStart=/usr/local/bin/node_exporter \
--web.listen-address=0.0.0.0:9100 \
--collector.filesystem.mount-points-exclude='^/(sys|proc|dev|host|etc)($$|/)' \
--collector.textfile.directory=/var/lib/node_exporter/textfile_collector
Restart=always
RestartSec=3
[Install]
WantedBy=multi-user.target
Start Node Exporter
Enable and start the Node Exporter service to begin collecting system metrics.
sudo mkdir -p /var/lib/node_exporter/textfile_collector
sudo chown -R prometheus:prometheus /var/lib/node_exporter
sudo systemctl daemon-reload
sudo systemctl enable --now node_exporter
sudo systemctl status node_exporter
Configure service discovery for multiple clusters
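Create a target file for file-based service discovery, so that federation targets across multiple data centers or cloud environments can be changed without editing prometheus.yml. Save it as a JSON file that the global instance can read (for example /etc/prometheus/global/federation_targets.json) and reference it from the federation scrape job with file_sd_configs in place of static_configs. This approach scales better than static configurations as your infrastructure grows.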
[
{
"targets": [
"prometheus-cluster-1.example.com:9091",
"prometheus-cluster-2.example.com:9091"
],
"labels": {
"region": "us-east-1",
"environment": "production",
"cluster_type": "kubernetes"
}
},
{
"targets": [
"prometheus-eu-1.example.com:9091",
"prometheus-eu-2.example.com:9091"
],
"labels": {
"region": "eu-west-1",
"environment": "production",
"cluster_type": "kubernetes"
}
},
{
"targets": [
"prometheus-staging.example.com:9091"
],
"labels": {
"region": "us-east-1",
"environment": "staging",
"cluster_type": "docker-swarm"
}
}
]
Verify your setup
Test that both Prometheus instances are running and that federation is working correctly.
# Check service status
sudo systemctl status prometheus-local prometheus-global node_exporter

# The API checks below pipe JSON through jq; install it first if it is
# missing (for example: sudo apt install -y jq).

# Test local Prometheus web interface
curl -s http://localhost:9091/api/v1/label/__name__/values | jq '.data[] | select(. | startswith("node_"))'

# Test global Prometheus web interface
curl -s http://localhost:9090/api/v1/label/__name__/values | jq '.data[] | select(. | startswith("up"))'

# Test federation endpoint
# -G with --data-urlencode sends the selector as an encoded query parameter;
# written directly into the URL, curl would treat [] and {} as glob patterns.
curl -s -G 'http://localhost:9091/federate' --data-urlencode 'match[]={__name__=~"up|node_.*"}' | head -20

# Check that global Prometheus is scraping local instance
curl -s -G 'http://localhost:9090/api/v1/query' --data-urlencode 'query=up{job="federated-clusters"}' | jq '.data.result[].value[1]'

# Verify aggregated metrics are being created
curl -s -G 'http://localhost:9090/api/v1/query' --data-urlencode 'query=global:cpu_usage_percent:mean5m' | jq '.data.result[]'

# Test configuration validity
sudo -u prometheus /usr/local/bin/promtool check config /etc/prometheus/global/prometheus.yml
sudo -u prometheus-local /usr/local/bin/promtool check config /etc/prometheus/local/prometheus.yml

# Check log files for any errors
sudo journalctl -u prometheus-global --no-pager -l
sudo journalctl -u prometheus-local --no-pager -l
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Federation not working | Firewall blocking connections | sudo ufw allow from federation_network to any port 9091 |
| No metrics in global instance | match[] parameters too restrictive | Add more patterns to match[] in federate job |
| High memory usage on global instance | Too many metrics being federated | Refine match[] filters to federate only essential metrics |
| Permission denied errors | Incorrect file ownership | sudo chown -R prometheus:prometheus /etc/prometheus/global |
| Service fails to start | Port already in use | sudo netstat -tlnp \| grep :9090 and change port if needed |
| External labels not appearing | honor_labels: true overriding | Set honor_labels: false or use different label names |
| Recording rules not working | Syntax errors in rules file | promtool check rules /etc/prometheus/global/global_rules.yml |
| Cross-cluster queries failing | Time synchronization issues | Configure NTP: sudo timedatectl set-ntp true |
Next steps
- Set up Thanos Receiver for remote write scalability with Prometheus integration
- Monitor Kubernetes clusters with Prometheus and Grafana for container orchestration insights
- Configure Prometheus long-term storage with Thanos for unlimited data retention
- Implement Prometheus Federation with Alertmanager clustering for high availability alerting
- Set up Prometheus federation with Consul service discovery for dynamic target management
Automated install script
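Run this to automate the core setup on a single host: it installs Prometheus and configures one local and one global (federating) instance. Node Exporter, Kubernetes service discovery and the multi-region targets shown above are not installed by the script.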
#!/usr/bin/env bash
#
# Installs one "local" and one "global" (federating) Prometheus instance on
# this host and registers both as systemd services.
#
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ANSI colour escape sequences used by the print_* helpers; NC resets colour.
readonly \
  RED='\033[0;31m' \
  GREEN='\033[0;32m' \
  YELLOW='\033[1;33m' \
  BLUE='\033[0;34m' \
  NC='\033[0m'

# Release to download and the accounts the two instances run as.
readonly \
  PROMETHEUS_VERSION="2.48.0" \
  PROMETHEUS_USER="prometheus" \
  PROMETHEUS_LOCAL_USER="prometheus-local"
# Print a progress message on stdout, prefixed with a blue [INFO] tag.
# Arguments: $1 - message text (backslash escapes in it are interpreted).
print_status() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}
# Print a completion message on stdout, prefixed with a green [SUCCESS] tag.
# Arguments: $1 - message text (backslash escapes in it are interpreted).
print_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
# Print an error message, prefixed with a red [ERROR] tag.
# Arguments: $1 - message text (backslash escapes in it are interpreted).
# Outputs:   writes to stderr, so errors stay visible when stdout is
#            redirected or captured.
print_error() {
  echo -e "${RED}[ERROR]${NC} $1" >&2
}
# Print a warning message, prefixed with a yellow [WARNING] tag.
# Arguments: $1 - message text (backslash escapes in it are interpreted).
# Outputs:   writes to stderr, like other diagnostics, so warnings do not
#            mix with regular output that is redirected or captured.
print_warning() {
  echo -e "${YELLOW}[WARNING]${NC} $1" >&2
}
# Roll back a failed installation: stop and disable both services, remove
# their unit files, delete both service accounts, and remove the installed
# configuration, data and binaries.
#
# NOTE(review): the accounts and paths are removed unconditionally, so a
# Prometheus installation that existed before this script ran is deleted too.
cleanup() {
  print_error "Installation failed. Cleaning up..."
  # Every step is best-effort: the failure may have happened before the
  # services, accounts or files were created.
  systemctl stop prometheus-global prometheus-local 2>/dev/null || true
  systemctl disable prometheus-global prometheus-local 2>/dev/null || true
  rm -f /etc/systemd/system/prometheus-global.service /etc/systemd/system/prometheus-local.service
  # Make systemd forget the unit files that were just removed.
  systemctl daemon-reload 2>/dev/null || true
  userdel -r "${PROMETHEUS_USER}" 2>/dev/null || true
  userdel -r "${PROMETHEUS_LOCAL_USER}" 2>/dev/null || true
  rm -rf /etc/prometheus /var/lib/prometheus /usr/local/bin/prometheus /usr/local/bin/promtool
}

# An ERR trap is not inherited by shell functions unless errtrace is enabled.
# Every installation step runs inside a function, so without it a failing
# step would end the script without cleanup ever being invoked.
# Explicit "exit 1" calls do not raise ERR and therefore still skip cleanup.
set -o errtrace
trap cleanup ERR
# Print the command-line help text on stdout.
# $0 is expanded so the text shows the name the script was invoked with.
usage() {
  printf '%s\n' \
    "Usage: $0 [OPTIONS]" \
    "Options:" \
    "-h, --help Show this help message" \
    "-l, --local-port Local Prometheus port (default: 9091)" \
    "-g, --global-port Global Prometheus port (default: 9090)" \
    "-c, --cluster-name Cluster name for external labels (default: local-cluster-1)" \
    "-r, --region Region for external labels (default: us-east-1)" \
    "Example:" \
    "$0 -c production-cluster -r us-west-2"
}
# Step 1/10: verify the script can work on this host.
# Exits with status 1 unless it runs as root (EUID 0) and a systemctl
# command is available.
check_prerequisites() {
  print_status "[1/10] Checking prerequisites..."

  (( EUID == 0 )) || {
    print_error "This script must be run as root"
    exit 1
  }

  command -v systemctl > /dev/null 2>&1 || {
    print_error "systemd is required but not found"
    exit 1
  }

  print_success "Prerequisites check passed"
}
# Step 2/10: identify the distribution from /etc/os-release and select the
# matching package and firewall tooling.
# Globals written: PKG_MGR, PKG_UPDATE, PKG_INSTALL, FIREWALL_CMD, plus
#                  every variable defined by /etc/os-release (it is sourced).
# Exits with status 1 when /etc/os-release is missing or ID is unsupported.
detect_distro() {
  print_status "[2/10] Detecting distribution..."

  if [[ ! -f /etc/os-release ]]; then
    print_error "Cannot detect distribution: /etc/os-release not found"
    exit 1
  fi

  # shellcheck source=/dev/null
  source /etc/os-release

  case "${ID}" in
    ubuntu | debian)
      PKG_MGR="apt"
      FIREWALL_CMD="ufw"
      ;;
    almalinux | rocky | centos | rhel | ol | fedora)
      PKG_MGR="dnf"
      FIREWALL_CMD="firewall-cmd"
      ;;
    amzn)
      PKG_MGR="yum"
      FIREWALL_CMD="firewall-cmd"
      ;;
    *)
      print_error "Unsupported distribution: $ID"
      exit 1
      ;;
  esac

  # All three package managers take the same sub-commands and flag here.
  PKG_UPDATE="${PKG_MGR} update -y"
  PKG_INSTALL="${PKG_MGR} install -y"

  print_success "Detected distribution: $ID (Package manager: $PKG_MGR)"
}
# Step 3/10: refresh the package index and install wget, curl and tar.
# Globals read: PKG_UPDATE, PKG_INSTALL - command lines as plain strings;
#               the script stops with a clear message if either is unset.
install_dependencies() {
  local -a update_cmd install_cmd

  print_status "[3/10] Installing dependencies..."

  # Split the command strings into words explicitly. An unquoted expansion
  # would additionally glob-expand any word containing *, ? or [.
  read -r -a update_cmd <<< "${PKG_UPDATE:?PKG_UPDATE is not set; run detect_distro first}"
  read -r -a install_cmd <<< "${PKG_INSTALL:?PKG_INSTALL is not set; run detect_distro first}"

  "${update_cmd[@]}"
  "${install_cmd[@]}" wget curl tar

  print_success "Dependencies installed"
}
# Step 4/10: create the two service accounts and the configuration and data
# directories, and hand each instance's directories to its account.
# Globals read: PROMETHEUS_USER, PROMETHEUS_LOCAL_USER
create_users_and_directories() {
  local account

  print_status "[4/10] Creating users and directories..."

  for account in "${PROMETHEUS_USER}" "${PROMETHEUS_LOCAL_USER}"; do
    # Only an already-existing account is tolerated. Any other useradd
    # failure is reported and stops the script (errexit) instead of being
    # silently discarded and surfacing later as a confusing chown error.
    if ! id -u "${account}" > /dev/null 2>&1; then
      useradd --no-create-home --shell /bin/false "${account}"
    fi
  done

  mkdir -p /etc/prometheus/{global,local}
  mkdir -p /var/lib/prometheus/{global,local}
  mkdir -p /etc/systemd/system

  chown "${PROMETHEUS_USER}:${PROMETHEUS_USER}" \
    /etc/prometheus/global /var/lib/prometheus/global
  chown "${PROMETHEUS_LOCAL_USER}:${PROMETHEUS_LOCAL_USER}" \
    /etc/prometheus/local /var/lib/prometheus/local

  print_success "Users and directories created"
}
# Step 5/10: download the Prometheus release tarball and install the
# prometheus and promtool binaries plus the console templates/libraries.
# Globals read: PROMETHEUS_VERSION, PROMETHEUS_USER
#
# The archive is fetched into a private mktemp directory rather than a fixed
# name in /tmp: a stale archive left by an earlier run would make wget save
# the new download under a ".1" suffix while tar unpacked the old file.
# The caller's working directory is left unchanged.
download_and_install_prometheus() {
  local release="prometheus-${PROMETHEUS_VERSION}.linux-amd64"
  local url="https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/${release}.tar.gz"
  local workdir

  print_status "[5/10] Downloading and installing Prometheus..."

  workdir=$(mktemp -d)
  wget -q -O "${workdir}/${release}.tar.gz" "${url}"
  tar -xzf "${workdir}/${release}.tar.gz" -C "${workdir}"

  cp "${workdir}/${release}/prometheus" /usr/local/bin/
  cp "${workdir}/${release}/promtool" /usr/local/bin/
  cp -r "${workdir}/${release}/consoles" /etc/prometheus/
  cp -r "${workdir}/${release}/console_libraries" /etc/prometheus/

  chown -R "${PROMETHEUS_USER}:${PROMETHEUS_USER}" \
    /etc/prometheus/consoles /etc/prometheus/console_libraries
  chown "${PROMETHEUS_USER}:${PROMETHEUS_USER}" \
    /usr/local/bin/prometheus /usr/local/bin/promtool
  chmod 755 /usr/local/bin/prometheus /usr/local/bin/promtool

  # Reached only on success; after a failed step the directory is left
  # behind, which also keeps the partial download available for inspection.
  rm -rf -- "${workdir:?}"

  print_success "Prometheus binaries installed"
}
# Step 6/10: write the local instance's prometheus.yml and local_rules.yml
# into /etc/prometheus/local and give them to the local service account.
# Arguments (all optional):
#   $1 - cluster name for the "cluster" labels (default: local-cluster-1)
#   $2 - region for the "region" external label (default: us-east-1)
#   $3 - port the local instance listens on, used for its self-scrape
#        target (default: 9091)
# Globals read: PROMETHEUS_LOCAL_USER
# The arguments are inserted into YAML verbatim; callers must pass values
# that are safe inside a single-quoted YAML string.
configure_local_prometheus() {
  local cluster_name=${1:-local-cluster-1}
  local region=${2:-us-east-1}
  local port=${3:-9091}
  local conf_dir=/etc/prometheus/local

  print_status "[6/10] Configuring local Prometheus instance..."

  # Unquoted delimiter so the three settings are substituted; the document
  # contains no other "$", backtick or backslash.
  cat > "${conf_dir}/prometheus.yml" << EOF
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: '${cluster_name}'
    region: '${region}'
    environment: 'production'

rule_files:
  - "local_rules.yml"

scrape_configs:
  - job_name: 'prometheus-local'
    static_configs:
      - targets: ['localhost:${port}']
        labels:
          instance: 'local-prometheus'

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['localhost:9100']
        labels:
          cluster: '${cluster_name}'

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093
EOF

  # \$labels is escaped so the Prometheus template variable reaches the file
  # literally instead of being expanded by the shell.
  cat > "${conf_dir}/local_rules.yml" << EOF
groups:
  - name: local_cluster_alerts
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
          cluster: '${cluster_name}'
        annotations:
          summary: "High CPU usage on {{ \$labels.instance }}"
          description: "CPU usage is above 80% for 5 minutes"
EOF

  chown -R "${PROMETHEUS_LOCAL_USER}:${PROMETHEUS_LOCAL_USER}" "${conf_dir}/"
  chmod 644 "${conf_dir}"/*.yml

  print_success "Local Prometheus configuration created"
}
# Step 7/10: write the global instance's prometheus.yml and global_rules.yml
# into /etc/prometheus/global and give them to the global service account.
# Arguments (both optional):
#   $1 - port of the local instance to federate from (default: 9091)
#   $2 - port the global instance listens on, used for its self-scrape
#        target (default: 9090)
# Globals read: PROMETHEUS_USER
configure_global_prometheus() {
  local local_port=${1:-9091}
  local global_port=${2:-9090}
  local conf_dir=/etc/prometheus/global

  print_status "[7/10] Configuring global Prometheus instance..."

  # Unquoted delimiter so the two ports are substituted; the document
  # contains no other "$", backtick or backslash.
  cat > "${conf_dir}/prometheus.yml" << EOF
global:
  scrape_interval: 30s
  evaluation_interval: 30s
  external_labels:
    monitor: 'global-prometheus'

rule_files:
  - "global_rules.yml"

scrape_configs:
  - job_name: 'federated'
    scrape_interval: 15s
    honor_labels: true
    metrics_path: '/federate'
    params:
      'match[]':
        - '{job=~"prometheus-.*"}'
        - '{__name__=~"node_.*"}'
        - '{__name__=~"up"}'
    static_configs:
      - targets:
          - 'localhost:${local_port}'

  - job_name: 'prometheus-global'
    static_configs:
      - targets: ['localhost:${global_port}']

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - localhost:9093
EOF

  # Quoted delimiter: the rule templates contain "$labels" and must be
  # written literally.
  # The alert also matches job "federated": when the local instance is
  # unreachable it is the federation scrape's own "up" series that drops to
  # 0, while the series copied from the local instance simply stop arriving.
  cat > "${conf_dir}/global_rules.yml" << 'EOF'
groups:
  - name: global_alerts
    rules:
      - alert: PrometheusDown
        expr: up{job=~"federated|prometheus-.*"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Prometheus instance {{ $labels.instance }} is down"
          description: "Prometheus instance {{ $labels.instance }} has been down for more than 5 minutes"
EOF

  chown -R "${PROMETHEUS_USER}:${PROMETHEUS_USER}" "${conf_dir}/"
  chmod 644 "${conf_dir}"/*.yml

  print_success "Global Prometheus configuration created"
}
# Step 8/10: write the prometheus-local and prometheus-global unit files,
# reload systemd and enable both services (they are not started here).
# Arguments (both optional):
#   $1 - listen port of the local instance (default: 9091)
#   $2 - listen port of the global instance (default: 9090)
# Globals read: PROMETHEUS_USER, PROMETHEUS_LOCAL_USER
create_systemd_services() {
  local local_port=${1:-9091}
  local global_port=${2:-9090}

  print_status "[8/10] Creating systemd services..."

  # Unquoted delimiters: account names and ports are substituted, and each
  # "\\" is written to the unit file as a single line-continuation "\".
  cat > /etc/systemd/system/prometheus-local.service << EOF
[Unit]
Description=Prometheus Local
Wants=network-online.target
After=network-online.target

[Service]
User=${PROMETHEUS_LOCAL_USER}
Group=${PROMETHEUS_LOCAL_USER}
Type=simple
ExecStart=/usr/local/bin/prometheus \\
    --config.file /etc/prometheus/local/prometheus.yml \\
    --storage.tsdb.path /var/lib/prometheus/local/ \\
    --web.console.templates=/etc/prometheus/consoles \\
    --web.console.libraries=/etc/prometheus/console_libraries \\
    --web.listen-address=0.0.0.0:${local_port}

[Install]
WantedBy=multi-user.target
EOF

  cat > /etc/systemd/system/prometheus-global.service << EOF
[Unit]
Description=Prometheus Global
Wants=network-online.target
After=network-online.target

[Service]
User=${PROMETHEUS_USER}
Group=${PROMETHEUS_USER}
Type=simple
ExecStart=/usr/local/bin/prometheus \\
    --config.file /etc/prometheus/global/prometheus.yml \\
    --storage.tsdb.path /var/lib/prometheus/global/ \\
    --web.console.templates=/etc/prometheus/consoles \\
    --web.console.libraries=/etc/prometheus/console_libraries \\
    --web.listen-address=0.0.0.0:${global_port}

[Install]
WantedBy=multi-user.target
EOF

  systemctl daemon-reload
  systemctl enable prometheus-local prometheus-global

  print_success "Systemd services created and enabled"
}
# Step 9/10: open the two Prometheus ports in the host firewall.
# Arguments (both optional):
#   $1 - local instance port (default: 9091)
#   $2 - global instance port (default: 9090)
# Globals read: PKG_MGR - selects ufw (apt) or firewalld (dnf/yum)
# Nothing is changed when the firewall tool is not installed or, for
# firewalld, when the service is not active. The ports are opened to any
# source address.
configure_firewall() {
  local local_port=${1:-9091}
  local global_port=${2:-9090}

  print_status "[9/10] Configuring firewall..."

  case "${PKG_MGR}" in
    apt)
      if command -v ufw > /dev/null 2>&1; then
        ufw allow "${global_port}/tcp" comment "Prometheus Global"
        ufw allow "${local_port}/tcp" comment "Prometheus Local"
      fi
      ;;
    dnf | yum)
      if command -v firewall-cmd > /dev/null 2>&1 \
        && systemctl is-active --quiet firewalld; then
        firewall-cmd --permanent --add-port="${global_port}/tcp"
        firewall-cmd --permanent --add-port="${local_port}/tcp"
        firewall-cmd --reload
      fi
      ;;
  esac

  print_success "Firewall configured"
}
# Poll a Prometheus /-/healthy endpoint on localhost until it responds with
# an HTTP success status.
# Arguments: $1 - TCP port; $2 - maximum attempts, one second apart
#            (default: 10)
# Returns:   0 as soon as a request succeeds, 1 if every attempt failed.
_prometheus_healthy() {
  local port=$1
  local attempts=${2:-10}
  local try

  for (( try = 1; try <= attempts; try++ )); do
    # -f makes curl fail on HTTP error statuses, so health is judged by the
    # status code only and not by the wording of the response body.
    if curl -fs -o /dev/null --max-time 5 \
      "http://localhost:${port}/-/healthy"; then
      return 0
    fi
    sleep 1
  done
  return 1
}

# Step 10/10: start both services, confirm systemd reports them active and
# that both health endpoints respond, then print the access URLs.
# Arguments (both optional):
#   $1 - local instance port (default: 9091)
#   $2 - global instance port (default: 9090)
# Exits with status 1 if a service is not active or a health check fails.
start_services_and_verify() {
  local local_port=${1:-9091}
  local global_port=${2:-9090}

  print_status "[10/10] Starting services and verifying installation..."

  systemctl start prometheus-local prometheus-global
  # Give a misconfigured instance time to crash before checking its state.
  sleep 5

  if ! systemctl is-active --quiet prometheus-local; then
    print_error "Local Prometheus failed to start"
    exit 1
  fi
  if ! systemctl is-active --quiet prometheus-global; then
    print_error "Global Prometheus failed to start"
    exit 1
  fi

  if _prometheus_healthy "${local_port}" \
    && _prometheus_healthy "${global_port}"; then
    print_success "All services are running and healthy"
    echo -e "\n${GREEN}Installation completed successfully!${NC}"
    echo -e "Local Prometheus: http://localhost:${local_port}"
    echo -e "Global Prometheus: http://localhost:${global_port}"
    echo -e "Federation URL: http://localhost:${local_port}/federate"
  else
    print_error "Health check failed"
    exit 1
  fi
}
# Entry point: parse the command-line options, validate them, then run the
# ten installation steps in order.
# Arguments: the script's command-line arguments (see usage).
# Exits with status 0 after --help, and 1 on an unknown option, an option
# without a value, or an invalid value.
main() {
  local local_port=9091
  local global_port=9090
  local cluster_name="local-cluster-1"
  local region="us-east-1"
  local opt port label

  while (( $# > 0 )); do
    opt=$1
    case "${opt}" in
      -h | --help)
        usage
        exit 0
        ;;
      -l | --local-port | -g | --global-port | -c | --cluster-name | -r | --region)
        # Without this check a trailing option would abort on "$2" with a
        # bare "unbound variable" message under nounset.
        if (( $# < 2 )); then
          print_error "Option ${opt} requires a value"
          usage
          exit 1
        fi
        case "${opt}" in
          -l | --local-port) local_port=$2 ;;
          -g | --global-port) global_port=$2 ;;
          -c | --cluster-name) cluster_name=$2 ;;
          -r | --region) region=$2 ;;
        esac
        shift 2
        ;;
      *)
        print_error "Unknown option: $1"
        usage
        exit 1
        ;;
    esac
  done

  # Ports: decimal, no leading zero, within 1-65535.
  for port in "${local_port}" "${global_port}"; do
    if [[ ! "${port}" =~ ^[1-9][0-9]{0,4}$ ]] || (( port > 65535 )); then
      print_error "Invalid port: ${port}"
      exit 1
    fi
  done
  if [[ "${local_port}" == "${global_port}" ]]; then
    print_error "Local and global ports must differ"
    exit 1
  fi

  # Label values are restricted to characters that need no quoting or
  # escaping wherever they are later written out.
  for label in "${cluster_name}" "${region}"; do
    if [[ ! "${label}" =~ ^[A-Za-z0-9._-]+$ ]]; then
      print_error "Invalid label value: ${label}"
      exit 1
    fi
  done

  check_prerequisites
  detect_distro
  install_dependencies
  create_users_and_directories
  download_and_install_prometheus
  # The parsed settings are handed to the remaining steps as positional
  # arguments, local port before global port.
  configure_local_prometheus "${cluster_name}" "${region}" "${local_port}"
  configure_global_prometheus "${local_port}" "${global_port}"
  create_systemd_services "${local_port}" "${global_port}"
  configure_firewall "${local_port}" "${global_port}"
  start_services_and_verify "${local_port}" "${global_port}"
}
main "$@"
Review the script before running. It must be run as root, for example: sudo bash install.sh