Set up comprehensive monitoring for Caddy reverse proxy and Consul service discovery with Prometheus metrics collection and Grafana dashboards for performance insights and alerting.
Prerequisites
- Root access to the server
- Caddy and Consul already installed and running
- Basic familiarity with systemd services
- SMTP server for alert notifications
What this solves
When running Caddy as a reverse proxy with Consul for service discovery, you need visibility into both systems to maintain performance and catch issues early. This tutorial sets up Prometheus exporters for both services and creates Grafana dashboards to monitor proxy metrics, service health, and cluster status in real-time.
Step-by-step configuration
Update system packages
Start by updating your package manager to ensure you get the latest versions of all components.
sudo apt update && sudo apt upgrade -y
Install Prometheus
Download and install Prometheus to collect metrics from Caddy and Consul. We'll create a dedicated user for security.
sudo useradd --no-create-home --shell /bin/false prometheus
sudo mkdir /etc/prometheus
sudo mkdir /var/lib/prometheus
sudo chown prometheus:prometheus /etc/prometheus
sudo chown prometheus:prometheus /var/lib/prometheus
cd /tmp
wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prometheus-2.45.0.linux-amd64.tar.gz
tar xvf prometheus-2.45.0.linux-amd64.tar.gz
sudo cp prometheus-2.45.0.linux-amd64/prometheus /usr/local/bin/
sudo cp prometheus-2.45.0.linux-amd64/promtool /usr/local/bin/
sudo chown prometheus:prometheus /usr/local/bin/prometheus
sudo chown prometheus:prometheus /usr/local/bin/promtool
Configure Prometheus for Caddy and Consul
Create the main Prometheus configuration file with scrape targets for both Caddy metrics and Consul endpoints.
global:
scrape_interval: 15s
evaluation_interval: 15s
rule_files:
- "caddy_rules.yml"
- "consul_rules.yml"
alerting:
alertmanagers:
- static_configs:
- targets:
- localhost:9093
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'caddy'
static_configs:
- targets: ['localhost:2019']
metrics_path: '/metrics'
scrape_interval: 30s
- job_name: 'consul'
static_configs:
- targets: ['localhost:8500']
metrics_path: '/v1/agent/metrics'
params:
format: ['prometheus']
scrape_interval: 30s
- job_name: 'consul-services'
consul_sd_configs:
- server: 'localhost:8500'
services: []
relabel_configs:
- source_labels: [__meta_consul_service]
target_label: service
- source_labels: [__meta_consul_node]
target_label: node
sudo chown prometheus:prometheus /etc/prometheus/prometheus.yml
Configure Caddy to expose metrics
Enable Caddy's built-in metrics endpoint by adding the admin API configuration to your Caddyfile.
{
admin :2019
servers {
metrics
}
}
example.com {
reverse_proxy consul.service.consul:8080 {
health_uri /health
health_interval 10s
health_timeout 5s
}
log {
output file /var/log/caddy/access.log {
roll_size 100mb
roll_keep 5
}
format json
}
}
sudo systemctl reload caddy
Enable Consul metrics
Configure Consul to expose Prometheus metrics through its HTTP API by updating the agent configuration.
{
"telemetry": {
"prometheus_retention_time": "30s",
"disable_hostname": false
},
"ports": {
"grpc": 8502
},
"connect": {
"enabled": true
}
}
sudo systemctl reload consul
Create alerting rules for Caddy
Define Prometheus alerting rules to monitor Caddy performance and availability issues.
groups:
- name: caddy.rules
rules:
- alert: CaddyDown
expr: up{job="caddy"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Caddy server is down"
description: "Caddy has been down for more than 1 minute"
- alert: CaddyHighRequestLatency
expr: histogram_quantile(0.95, rate(caddy_http_request_duration_seconds_bucket[5m])) > 0.5
for: 2m
labels:
severity: warning
annotations:
summary: "High request latency on Caddy"
description: "95th percentile latency is {{ $value }}s"
- alert: CaddyHighErrorRate
expr: rate(caddy_http_requests_total{status=~"5.."}[5m]) / rate(caddy_http_requests_total[5m]) > 0.05
for: 2m
labels:
severity: critical
annotations:
summary: "High error rate on Caddy"
description: "Error rate is {{ $value | humanizePercentage }}"
- alert: CaddyUpstreamDown
expr: caddy_reverse_proxy_upstreams_healthy == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Caddy upstream is down"
description: "No healthy upstreams available for {{ $labels.upstream }}"
- alert: CaddyHighMemoryUsage
expr: process_resident_memory_bytes{job="caddy"} / 1024 / 1024 > 500
for: 5m
labels:
severity: warning
annotations:
summary: "Caddy high memory usage"
description: "Caddy memory usage is {{ $value }}MB"
Create alerting rules for Consul
Set up Consul-specific alerts for cluster health, service discovery issues, and performance problems.
groups:
- name: consul.rules
rules:
- alert: ConsulDown
expr: up{job="consul"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Consul agent is down"
description: "Consul agent has been down for more than 1 minute"
- alert: ConsulLeaderMissing
expr: consul_raft_leader == 0
for: 1m
labels:
severity: critical
annotations:
summary: "Consul cluster has no leader"
description: "Consul cluster is without a leader"
- alert: ConsulHighMemoryUsage
expr: consul_runtime_alloc_bytes / 1024 / 1024 > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "Consul high memory usage"
description: "Consul memory usage is {{ $value }}MB"
- alert: ConsulServiceUnhealthy
expr: consul_health_service_query_count{status!="passing"} > 0
for: 2m
labels:
severity: warning
annotations:
summary: "Consul service health check failing"
description: "Service {{ $labels.service }} health check is failing"
- alert: ConsulNodeUnhealthy
expr: consul_health_node_query_count{status!="passing"} > 0
for: 2m
labels:
severity: critical
annotations:
summary: "Consul node health check failing"
description: "Node {{ $labels.node }} health check is failing"
- alert: ConsulRaftLogGrowth
expr: increase(consul_raft_commitIndex[1h]) > 10000
for: 5m
labels:
severity: warning
annotations:
summary: "High Consul raft log growth"
description: "Raft log has grown by {{ $value }} entries in the last hour"
Set correct permissions for Prometheus files
Ensure Prometheus can read all configuration files by setting appropriate ownership and permissions.
sudo chown prometheus:prometheus /etc/prometheus/caddy_rules.yml
sudo chown prometheus:prometheus /etc/prometheus/consul_rules.yml
sudo chmod 644 /etc/prometheus/*.yml
Create Prometheus systemd service
Set up Prometheus as a systemd service for automatic startup and management.
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
Type=simple
ExecStart=/usr/local/bin/prometheus \
--config.file /etc/prometheus/prometheus.yml \
--storage.tsdb.path /var/lib/prometheus/ \
--web.console.templates=/etc/prometheus/consoles \
--web.console.libraries=/etc/prometheus/console_libraries \
--web.listen-address=0.0.0.0:9090 \
--web.enable-lifecycle \
--storage.tsdb.retention.time=30d
[Install]
WantedBy=multi-user.target
sudo systemctl daemon-reload
sudo systemctl enable --now prometheus
Install Grafana
Add the Grafana repository and install it to create dashboards for monitoring data visualization.
sudo apt install -y software-properties-common wget
wget -q -O - https://packages.grafana.com/gpg.key | sudo apt-key add -
echo "deb https://packages.grafana.com/oss/deb stable main" | sudo tee /etc/apt/sources.list.d/grafana.list
sudo apt update
sudo apt install -y grafana
Configure Grafana data source
Create a Grafana configuration to automatically provision the Prometheus data source.
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://localhost:9090
isDefault: true
editable: true
Create Caddy dashboard configuration
Set up a comprehensive Grafana dashboard to monitor Caddy reverse proxy metrics and performance.
{
"dashboard": {
"id": null,
"title": "Caddy Reverse Proxy Monitoring",
"tags": ["caddy", "proxy"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "HTTP Requests per Second",
"type": "graph",
"targets": [
{
"expr": "rate(caddy_http_requests_total[5m])",
"legendFormat": "{{method}} {{host}}"
}
],
"yAxes": [
{
"label": "requests/sec"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "Response Time Percentiles",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.50, rate(caddy_http_request_duration_seconds_bucket[5m]))",
"legendFormat": "50th percentile"
},
{
"expr": "histogram_quantile(0.95, rate(caddy_http_request_duration_seconds_bucket[5m]))",
"legendFormat": "95th percentile"
},
{
"expr": "histogram_quantile(0.99, rate(caddy_http_request_duration_seconds_bucket[5m]))",
"legendFormat": "99th percentile"
}
],
"yAxes": [
{
"label": "seconds"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
}
},
{
"id": 3,
"title": "HTTP Status Codes",
"type": "graph",
"targets": [
{
"expr": "rate(caddy_http_requests_total{status=~\"2..\"}[5m])",
"legendFormat": "2xx Success"
},
{
"expr": "rate(caddy_http_requests_total{status=~\"4..\"}[5m])",
"legendFormat": "4xx Client Error"
},
{
"expr": "rate(caddy_http_requests_total{status=~\"5..\"}[5m])",
"legendFormat": "5xx Server Error"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
}
},
{
"id": 4,
"title": "Upstream Health Status",
"type": "stat",
"targets": [
{
"expr": "caddy_reverse_proxy_upstreams_healthy",
"legendFormat": "{{upstream}}"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "30s"
}
}
Create Consul dashboard configuration
Build a Grafana dashboard specifically for monitoring Consul cluster health and service discovery metrics.
{
"dashboard": {
"id": null,
"title": "Consul Cluster Monitoring",
"tags": ["consul", "service-discovery"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "Consul Nodes Status",
"type": "stat",
"targets": [
{
"expr": "consul_serf_lan_members",
"legendFormat": "Active Nodes"
}
],
"gridPos": {
"h": 4,
"w": 6,
"x": 0,
"y": 0
}
},
{
"id": 2,
"title": "Raft Leader Status",
"type": "stat",
"targets": [
{
"expr": "consul_raft_leader",
"legendFormat": "Has Leader"
}
],
"gridPos": {
"h": 4,
"w": 6,
"x": 6,
"y": 0
}
},
{
"id": 3,
"title": "Service Health Checks",
"type": "graph",
"targets": [
{
"expr": "consul_health_service_query_count{status=\"passing\"}",
"legendFormat": "Passing"
},
{
"expr": "consul_health_service_query_count{status=\"warning\"}",
"legendFormat": "Warning"
},
{
"expr": "consul_health_service_query_count{status=\"critical\"}",
"legendFormat": "Critical"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 4
}
},
{
"id": 4,
"title": "Memory Usage",
"type": "graph",
"targets": [
{
"expr": "consul_runtime_alloc_bytes / 1024 / 1024",
"legendFormat": "Allocated Memory (MB)"
},
{
"expr": "consul_runtime_sys_bytes / 1024 / 1024",
"legendFormat": "System Memory (MB)"
}
],
"yAxes": [
{
"label": "MB"
}
],
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 4
}
},
{
"id": 5,
"title": "Raft Transactions",
"type": "graph",
"targets": [
{
"expr": "rate(consul_raft_apply[5m])",
"legendFormat": "Apply Rate"
},
{
"expr": "consul_raft_commitIndex",
"legendFormat": "Commit Index"
}
],
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 12
}
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "30s"
}
}
Create dashboard provisioning configuration
Set up Grafana to automatically load the dashboard configurations on startup.
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: /etc/grafana/provisioning/dashboards
Install and configure Alertmanager
Set up Alertmanager to handle alerts from Prometheus and send notifications for critical issues.
cd /tmp
wget https://github.com/prometheus/alertmanager/releases/download/v0.26.0/alertmanager-0.26.0.linux-amd64.tar.gz
tar xvf alertmanager-0.26.0.linux-amd64.tar.gz
sudo cp alertmanager-0.26.0.linux-amd64/alertmanager /usr/local/bin/
sudo cp alertmanager-0.26.0.linux-amd64/amtool /usr/local/bin/
sudo useradd --no-create-home --shell /bin/false alertmanager
sudo mkdir /etc/alertmanager
sudo mkdir /var/lib/alertmanager
sudo chown alertmanager:alertmanager /etc/alertmanager
sudo chown alertmanager:alertmanager /var/lib/alertmanager
sudo chown alertmanager:alertmanager /usr/local/bin/alertmanager
sudo chown alertmanager:alertmanager /usr/local/bin/amtool
Configure Alertmanager notifications
Set up Alertmanager to send email notifications for critical alerts from both Caddy and Consul monitoring.
global:
smtp_smarthost: 'smtp.example.com:587'
smtp_from: 'alerts@example.com'
smtp_auth_username: 'alerts@example.com'
smtp_auth_password: 'your_smtp_password'
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'web.hook'
receivers:
- name: 'web.hook'
email_configs:
- to: 'admin@example.com'
subject: '[ALERT] {{ .GroupLabels.alertname }}'
body: |
{{ range .Alerts }}
Alert: {{ .Annotations.summary }}
Description: {{ .Annotations.description }}
Labels: {{ .Labels }}
{{ end }}
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
sudo chown alertmanager:alertmanager /etc/alertmanager/alertmanager.yml
Create Alertmanager systemd service
Set up Alertmanager as a systemd service for automatic startup and process management.
[Unit]
Description=Alertmanager
Wants=network-online.target
After=network-online.target
[Service]
User=alertmanager
Group=alertmanager
Type=simple
ExecStart=/usr/local/bin/alertmanager \
--config.file /etc/alertmanager/alertmanager.yml \
--storage.path /var/lib/alertmanager/ \
--web.listen-address=0.0.0.0:9093 \
--web.external-url=http://localhost:9093
[Install]
WantedBy=multi-user.target
sudo systemctl daemon-reload
sudo systemctl enable --now alertmanager
Start and enable Grafana
Enable Grafana to start on boot and start the service to begin monitoring dashboard access.
sudo systemctl enable --now grafana-server
Configure firewall rules
Open necessary ports for accessing Prometheus, Grafana, and Alertmanager web interfaces securely.
sudo ufw allow 9090/tcp comment 'Prometheus'
sudo ufw allow 3000/tcp comment 'Grafana'
sudo ufw allow 9093/tcp comment 'Alertmanager'
sudo ufw allow 2019/tcp comment 'Caddy Admin'
Verify your setup
Check that all components are running and collecting metrics properly before proceeding to dashboard configuration.
# Check service status
sudo systemctl status prometheus grafana-server alertmanager
# Verify Prometheus is scraping targets
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[].health'
# Test Caddy metrics endpoint
curl -s http://localhost:2019/metrics | grep caddy_http_requests_total
# Test Consul metrics endpoint
curl -s http://localhost:8500/v1/agent/metrics?format=prometheus | grep consul_
# Check Grafana is accessible
curl -I http://localhost:3000
Access Grafana at http://your_server_ip:3000 with default credentials admin/admin. The dashboards should automatically load and display metrics from both Caddy and Consul. You can find more details on securing web servers in our Caddy SSL certificates tutorial.
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Prometheus shows targets as down | Services not exposing metrics | Check service configs and restart: sudo systemctl restart caddy consul |
| No data in Grafana dashboards | Data source not configured | Verify Prometheus data source: curl -s http://localhost:9090/api/v1/query?query=up |
| Caddy metrics endpoint 404 | Admin API not enabled | Add admin :2019 to Caddyfile global block and reload |
| Consul metrics return empty | Telemetry not configured | Add telemetry config to /etc/consul.d/metrics.json and restart |
| Alertmanager not sending emails | SMTP configuration incorrect | Test with: /usr/local/bin/amtool config check /etc/alertmanager/alertmanager.yml |
| Dashboard panels show no data | Metric names changed | Check available metrics: curl -s http://localhost:9090/api/v1/label/__name__/values |
Next steps
- Set up Alertmanager webhook integrations for Slack and Microsoft Teams notifications
- Configure advanced Grafana dashboards with custom panels and alerting rules
- Monitor external endpoints with Blackbox Exporter for comprehensive service monitoring
- Set up multi-datacenter Consul monitoring for distributed infrastructure
- Implement advanced Caddy performance monitoring with custom metrics and log analysis
Running this in production?
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
#
# Caddy + Consul + Prometheus + Grafana monitoring setup script.
# Installs Prometheus and Grafana on apt-, dnf- and yum-based distributions.

# Strict mode: stop on command failure, on use of an unset variable, and on
# a failure anywhere in a pipeline.
set -o errexit -o nounset -o pipefail

# Terminal colours. They are stored as backslash escapes, so whoever prints
# them must interpret the escapes (for example `echo -e` or `printf '%b'`).
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'

# Release versions and the directory that contains this script.
PROMETHEUS_VERSION="2.45.0"
GRAFANA_VERSION="10.2.0"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Print the command-line synopsis and one example invocation, then terminate
# the script with exit status 1.
usage() {
  printf '%s\n' \
    "Usage: $0 [domain] [consul_datacenter]" \
    "Example: $0 monitoring.example.com dc1"
  exit 1
}
# Logging helpers. Each takes a single argument, the message ($1), and prints
# it after a coloured severity tag.
# Globals read: GREEN, YELLOW, RED, NC (colour escape sequences)
# Output: log_info writes to stdout; log_warn and log_error write to stderr
#   so diagnostics stay out of captured or piped output.
# Only the colour variables go through %b (escape interpretation); the
# message is printed with %s so backslashes in it are emitted literally.
log_info() { printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "$1"; }
log_warn() { printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$1" >&2; }
log_error() { printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1" >&2; }
# Roll back a failed installation: stop the prometheus and grafana-server
# services, delete the prometheus account, and remove the Prometheus
# configuration, data directory and binaries. Always ends the script with
# exit status 1.
cleanup() {
  # Disarm the trap first so a failure during rollback cannot re-enter it.
  trap - ERR
  log_error "Installation failed! Rolling back..."
  # Best effort: the services or the account may not exist yet.
  systemctl stop prometheus grafana-server 2>/dev/null || true
  userdel prometheus 2>/dev/null || true
  rm -rf /etc/prometheus /var/lib/prometheus /usr/local/bin/prometheus /usr/local/bin/promtool
  exit 1
}

# An ERR trap is normally not inherited by shell functions. Every installer
# step runs inside a function, so errtrace is required for a failing step to
# reach the rollback.
set -o errtrace
trap cleanup ERR
# Verify the preconditions for the installer: it must run as root and wget
# must be available. Logs an error and exits with status 1 otherwise.
check_prerequisites() {
  echo "[1/12] Checking prerequisites..."

  if (( EUID != 0 )); then
    log_error "This script must be run as root"
    exit 1
  fi

  command -v wget > /dev/null 2>&1 || {
    log_error "wget is required but not installed"
    exit 1
  }
}
# Identify the distribution from /etc/os-release and record which package
# manager and firewall front end go with it.
# Globals written: PKG_MGR, PKG_INSTALL, PKG_UPDATE, FIREWALL_CMD, plus every
#   variable /etc/os-release defines (the file is sourced into this shell).
# Exits with status 1 when the file is missing or the ID is not supported.
detect_distro() {
  echo "[2/12] Detecting distribution..."

  if [[ ! -f /etc/os-release ]]; then
    log_error "Cannot detect distribution - /etc/os-release not found"
    exit 1
  fi

  # shellcheck source=/dev/null
  . /etc/os-release

  # Map the distribution ID to its package manager.
  case "$ID" in
    ubuntu | debian)
      PKG_MGR="apt"
      ;;
    almalinux | rocky | centos | rhel | ol | fedora)
      PKG_MGR="dnf"
      ;;
    amzn)
      PKG_MGR="yum"
      ;;
    *)
      log_error "Unsupported distribution: $ID"
      exit 1
      ;;
  esac

  # Derive the command strings from the chosen package manager.
  PKG_INSTALL="${PKG_MGR} install -y"
  if [[ "$PKG_MGR" == "apt" ]]; then
    PKG_UPDATE="apt update && apt upgrade -y"
    FIREWALL_CMD="ufw allow"
  else
    PKG_UPDATE="${PKG_MGR} update -y"
    FIREWALL_CMD="firewall-cmd --permanent --add-port"
  fi

  log_info "Detected: $PRETTY_NAME"
}
# Refresh the package index and upgrade the installed packages.
# Globals read: PKG_MGR, PKG_UPDATE
update_system() {
  echo "[3/12] Updating system packages..."
  case "$PKG_MGR" in
    apt)
      # The apt value of PKG_UPDATE contains "&&". Expanding that string
      # unquoted hands "&&" to apt as a literal argument instead of chaining
      # two commands, so the two steps are run explicitly here.
      apt update
      apt upgrade -y
      ;;
    dnf | yum)
      "$PKG_MGR" update -y
      ;;
    *)
      # Unknown package manager: fall back to the configured command string,
      # which is word-split on purpose.
      # shellcheck disable=SC2086
      $PKG_UPDATE
      ;;
  esac
}
# Install the helper packages the installer relies on.
# Globals read: PKG_MGR (selects the package list), PKG_INSTALL (install
#   command, deliberately word-split into command and options)
install_dependencies() {
  echo "[4/12] Installing dependencies..."

  # Packages needed on every supported package manager.
  local -a packages=(curl wget tar)

  if [[ "$PKG_MGR" == "apt" ]]; then
    packages+=(adduser software-properties-common)
  elif [[ "$PKG_MGR" != "dnf" && "$PKG_MGR" != "yum" ]]; then
    # Any other package manager: nothing to install.
    return 0
  fi

  # shellcheck disable=SC2086
  $PKG_INSTALL "${packages[@]}"
}
# Create the prometheus account (no home directory, /bin/false as shell) and
# the configuration and data directories, owned by that account with mode 755.
create_prometheus_user() {
  echo "[5/12] Creating prometheus user and directories..."
  local -a dirs=(/etc/prometheus /var/lib/prometheus)

  # A useradd failure is deliberately ignored (for example when the account
  # is already present).
  useradd --no-create-home --shell /bin/false prometheus || :

  mkdir -p "${dirs[@]}"
  chown prometheus:prometheus "${dirs[@]}"
  chmod 755 "${dirs[@]}"
}
# Download the Prometheus release tarball (linux-amd64 build) and install the
# prometheus and promtool binaries into /usr/local/bin, owned by the
# prometheus account with mode 755.
# Globals read: PROMETHEUS_VERSION
# Returns: 0 on success, otherwise the status of the step that failed.
install_prometheus() {
  echo "[6/12] Installing Prometheus..."

  local release="prometheus-${PROMETHEUS_VERSION}.linux-amd64"
  local url="https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/${release}.tar.gz"
  local workdir archive
  local rc=0

  # Work in a private temporary directory instead of a predictable name in
  # /tmp. That way a stale tarball from an earlier run cannot be extracted by
  # mistake, and the script's working directory is left untouched.
  workdir=$(mktemp -d) || return 1
  archive="${workdir}/${release}.tar.gz"

  {
    wget -q -O "$archive" "$url" &&
      tar xzf "$archive" -C "$workdir" &&
      cp "${workdir}/${release}/prometheus" "${workdir}/${release}/promtool" /usr/local/bin/ &&
      chown prometheus:prometheus /usr/local/bin/prometheus /usr/local/bin/promtool &&
      chmod 755 /usr/local/bin/prometheus /usr/local/bin/promtool
  } || rc=$?

  # Remove the download directory whether or not the install succeeded.
  rm -rf -- "${workdir:?}"
  return "$rc"
}
# Write the main Prometheus configuration (prometheus.yml) with scrape jobs
# for Prometheus itself, Caddy, the Consul agent, and services discovered
# through Consul.
# Arguments: $1 - directory to write into (optional, default /etc/prometheus)
# The file is then given to prometheus:prometheus with mode 644.
configure_prometheus() {
  local config_dir=${1:-/etc/prometheus}
  local config_file="${config_dir}/prometheus.yml"

  echo "[7/12] Configuring Prometheus..."

  # YAML nesting is significant: the keys below must stay indented under
  # their parents or Prometheus rejects the file. The quoted delimiter keeps
  # the shell from expanding anything inside the document.
  cat > "$config_file" << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "caddy_rules.yml"
  - "consul_rules.yml"

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'caddy'
    static_configs:
      - targets: ['localhost:2019']
    metrics_path: '/metrics'
    scrape_interval: 30s

  - job_name: 'consul'
    static_configs:
      - targets: ['localhost:8500']
    metrics_path: '/v1/agent/metrics'
    params:
      format: ['prometheus']
    scrape_interval: 30s

  - job_name: 'consul-services'
    consul_sd_configs:
      - server: 'localhost:8500'
        services: []
    relabel_configs:
      - source_labels: [__meta_consul_service]
        target_label: service
      - source_labels: [__meta_consul_node]
        target_label: node
EOF

  chown prometheus:prometheus "$config_file"
  chmod 644 "$config_file"
}
# Write the Prometheus alerting rule files caddy_rules.yml and
# consul_rules.yml.
# Arguments: $1 - directory to write into (optional, default /etc/prometheus)
# Afterwards every *.yml file in that directory is given to
# prometheus:prometheus with mode 644.
create_alerting_rules() {
  local rules_dir=${1:-/etc/prometheus}

  echo "[8/12] Creating alerting rules..."

  # YAML nesting is significant: each rule must sit under its group's
  # "rules:" list or Prometheus fails to load the file. The quoted delimiter
  # keeps the shell from expanding "$value" and "$labels" in the templates.
  cat > "${rules_dir}/caddy_rules.yml" << 'EOF'
groups:
  - name: caddy.rules
    rules:
      - alert: CaddyDown
        expr: up{job="caddy"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Caddy server is down"
          description: "Caddy has been down for more than 1 minute"
      - alert: CaddyHighRequestLatency
        expr: histogram_quantile(0.95, rate(caddy_http_request_duration_seconds_bucket[5m])) > 0.5
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High request latency on Caddy"
          description: "95th percentile latency is {{ $value }}s"
EOF

  cat > "${rules_dir}/consul_rules.yml" << 'EOF'
groups:
  - name: consul.rules
    rules:
      - alert: ConsulDown
        expr: up{job="consul"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Consul agent is down"
          description: "Consul agent has been down for more than 1 minute"
      - alert: ConsulServiceUnhealthy
        expr: consul_health_service_status{status!="passing"} > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Consul service unhealthy"
          description: "Service {{ $labels.service_name }} is in {{ $labels.status }} state"
EOF

  chown prometheus:prometheus "${rules_dir}"/*.yml
  chmod 644 "${rules_dir}"/*.yml
}
# Install the systemd unit for Prometheus (mode 644), reload systemd so it
# sees the new unit, and enable the service. The service is enabled here but
# not started.
create_prometheus_service() {
  local unit_file=/etc/systemd/system/prometheus.service

  echo "[9/12] Creating Prometheus systemd service..."

  # Quoted delimiter: the unit text is written verbatim, without expansion.
  cat << 'EOF' > "$unit_file"
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/
After=network.target
[Service]
User=prometheus
Group=prometheus
Type=simple
Restart=on-failure
ExecStart=/usr/local/bin/prometheus \
--config.file=/etc/prometheus/prometheus.yml \
--storage.tsdb.path=/var/lib/prometheus/ \
--web.console.templates=/etc/prometheus/consoles \
--web.console.libraries=/etc/prometheus/console_libraries \
--web.listen-address=0.0.0.0:9090 \
--web.enable-lifecycle \
--storage.tsdb.retention.time=30d
[Install]
WantedBy=multi-user.target
EOF

  chmod 644 "$unit_file"
  systemctl daemon-reload
  systemctl enable prometheus
}
# Register the Grafana OSS package repository for the detected package
# manager, install the grafana package, and enable grafana-server.
# Globals read: PKG_MGR, PKG_INSTALL (deliberately word-split into command
#   and options)
install_grafana() {
  echo "[10/12] Installing Grafana..."

  case "$PKG_MGR" in
    apt)
      # apt-key is deprecated. Keep the repository key in its own keyring
      # file and bind the repository to it with signed-by, so the key is
      # trusted for this repository only. apt accepts ASCII-armored keys
      # when the file has an .asc extension; the file must be world-readable.
      mkdir -p /etc/apt/keyrings
      wget -q -O /etc/apt/keyrings/grafana.asc https://packages.grafana.com/gpg.key
      chmod 644 /etc/apt/keyrings/grafana.asc
      echo "deb [signed-by=/etc/apt/keyrings/grafana.asc] https://packages.grafana.com/oss/deb stable main" > /etc/apt/sources.list.d/grafana.list
      apt update
      # shellcheck disable=SC2086
      $PKG_INSTALL grafana
      ;;
    dnf | yum)
      # dnf and yum share the same repository definition.
      cat > /etc/yum.repos.d/grafana.repo << 'EOF'
[grafana]
name=grafana
baseurl=https://packages.grafana.com/oss/rpm
repo_gpgcheck=1
enabled=1
gpgcheck=1
gpgkey=https://packages.grafana.com/gpg.key
EOF
      # shellcheck disable=SC2086
      $PKG_INSTALL grafana
      ;;
  esac

  systemctl enable grafana-server
}
# Open the Prometheus (9090/tcp) and Grafana (3000/tcp) ports using ufw on
# apt systems or firewalld on dnf/yum systems. Does nothing when the
# matching firewall tool is not installed.
# Globals read: PKG_MGR
configure_firewall() {
  echo "[11/12] Configuring firewall..."

  local -a ports=(9090/tcp 3000/tcp)
  local port

  if [[ "$PKG_MGR" == "apt" ]]; then
    command -v ufw > /dev/null 2>&1 || return 0
    for port in "${ports[@]}"; do
      ufw allow "$port"
    done
  elif [[ "$PKG_MGR" == "dnf" || "$PKG_MGR" == "yum" ]]; then
    command -v firewall-cmd > /dev/null 2>&1 || return 0
    for port in "${ports[@]}"; do
      firewall-cmd --permanent --add-port="$port"
    done
    # Permanent rules take effect only after a reload.
    firewall-cmd --reload
  fi
}
# Poll a URL until it answers with a successful HTTP status.
# Arguments: $1 - URL to probe
#            $2 - maximum number of attempts (optional, default 30)
#            $3 - seconds to wait between attempts (optional, default 2)
# Returns: 0 as soon as one probe succeeds, 1 when every attempt failed.
_wait_for_http_ok() {
  local url=$1
  local attempts=${2:-30}
  local delay=${3:-2}
  local i

  for (( i = 0; i < attempts; i++ )); do
    if (( i > 0 )); then
      sleep "$delay"
    fi
    # -f turns an HTTP error status (such as 503 while starting) into a
    # non-zero exit code, so the response body does not need to be parsed.
    if curl -sf -o /dev/null --max-time 5 "$url"; then
      return 0
    fi
  done
  return 1
}

# Start Prometheus and Grafana, then wait for each to report healthy.
# Logs the result per service and exits with status 1 when a service does
# not become healthy within the polling window.
start_and_verify() {
  echo "[12/12] Starting services and verifying installation..."
  systemctl start prometheus
  systemctl start grafana-server

  # Readiness is judged by HTTP status rather than by matching the response
  # text, because the exact wording of the /-/ready body differs between
  # Prometheus releases. Polling replaces a single fixed sleep so slow
  # first starts are tolerated.
  if _wait_for_http_ok http://localhost:9090/-/ready; then
    log_info "Prometheus is running successfully"
  else
    log_error "Prometheus failed to start properly"
    exit 1
  fi

  if _wait_for_http_ok http://localhost:3000/api/health; then
    log_info "Grafana is running successfully"
  else
    log_error "Grafana failed to start properly"
    exit 1
  fi
}
# Run every installation step in order and print the access summary.
# Arguments: $1 - domain (optional, default "localhost")
#            $2 - Consul datacenter (optional, default "dc1")
# Globals written: DOMAIN, CONSUL_DC
main() {
  DOMAIN=${1:-"localhost"}
  CONSUL_DC=${2:-"dc1"}

  # The steps, in the order they must run.
  local -a stages=(
    check_prerequisites
    detect_distro
    update_system
    install_dependencies
    create_prometheus_user
    install_prometheus
    configure_prometheus
    create_alerting_rules
    create_prometheus_service
    install_grafana
    configure_firewall
    start_and_verify
  )
  local stage
  for stage in "${stages[@]}"; do
    "$stage"
  done

  log_info "Installation completed successfully!"
  log_info "Prometheus: http://localhost:9090"
  log_info "Grafana: http://localhost:3000 (admin/admin)"
  log_warn "Remember to configure Caddy admin API on :2019 and Consul telemetry"
}
# Entry point: pass every command-line argument through to main unchanged.
main "$@"
Review the script before running. Execute with: bash install.sh