Set up comprehensive backup monitoring using Prometheus exporters and Grafana dashboards. Configure automated alerts for backup failures, track success rates, and visualize backup infrastructure health across multiple systems.
Prerequisites
- Existing Prometheus and Grafana installation
- Node Exporter running
- Basic Python 3 environment
- Access to backup log files
What this solves
Backup systems fail silently more often than they should, leaving you vulnerable to data loss without warning. This tutorial shows you how to monitor backup jobs using Prometheus metrics collection and Grafana visualization, with automated alerting when backups fail or miss their schedules.
Step-by-step configuration
Install backup monitoring dependencies
Start by installing the required packages for backup monitoring and metric collection.
# Refresh the package index and install the command-line tools used later
sudo apt update
sudo apt install -y python3-pip curl wget jq
# Install the Python libraries system-wide from distribution packages so the
# exporter can import them when it runs as the prometheus user. A plain
# "pip3 install" as your own user installs into ~/.local, and releases that
# mark the system Python as externally managed (PEP 668) refuse it outright.
sudo apt install -y python3-prometheus-client python3-psutil
Create backup status exporter
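Create a Python script that exports backup job metrics in Prometheus format. Save it as /usr/local/bin/backup-exporter.py, the path that the later steps make executable and run.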
#!/usr/bin/env python3
import json
import os
import time
from datetime import datetime, timedelta
from prometheus_client import CollectorRegistry, Gauge, generate_latest, write_to_textfile
import glob
# Configuration
# Default directory searched for backup logs
BACKUP_LOG_DIR = '/var/log/backups'
# Directory the metrics file is written to for node_exporter to pick up
METRICS_DIR = '/var/lib/prometheus/node-exporter'
# JSON file describing the backup jobs to monitor
BACKUP_CONFIGS = '/etc/backup-monitor.json'

# Prometheus metrics
# All gauges share one registry, which is what gets written to the metrics
# file; every gauge is labelled by job name and backup type.
registry = CollectorRegistry()
backup_success = Gauge(
    'backup_job_success',
    'Backup job success status (1=success, 0=failure)',
    ['job_name', 'backup_type'],
    registry=registry,
)
backup_duration = Gauge(
    'backup_job_duration_seconds',
    'Backup job duration in seconds',
    ['job_name', 'backup_type'],
    registry=registry,
)
backup_size_bytes = Gauge(
    'backup_size_bytes',
    'Backup size in bytes',
    ['job_name', 'backup_type'],
    registry=registry,
)
backup_last_run = Gauge(
    'backup_job_last_run_timestamp',
    'Unix timestamp of last backup run',
    ['job_name', 'backup_type'],
    registry=registry,
)
backup_files_count = Gauge(
    'backup_files_count',
    'Number of files in backup',
    ['job_name', 'backup_type'],
    registry=registry,
)
def load_backup_config():
    """Return the parsed job configuration from BACKUP_CONFIGS.

    A missing configuration file is not an error: an empty job list is
    returned instead.
    """
    try:
        config_file = open(BACKUP_CONFIGS, 'r')
    except FileNotFoundError:
        return {"jobs": []}
    with config_file:
        return json.load(config_file)
def parse_backup_logs():
    """Parse the newest log of every configured job and update the gauges.

    For each job in the configuration the most recently modified file
    matching its log pattern is read and handed to the parser for the job's
    backup type (generic parser for unknown types). A job with no log files,
    or whose log cannot be read or parsed, is reported as failed.
    """
    parsers = {
        'mysql': parse_mysql_backup_log,
        'postgres': parse_postgres_backup_log,
        'filesystem': parse_filesystem_backup_log,
    }
    config = load_backup_config()
    for job in config.get('jobs', []):
        job_name = job['name']
        backup_type = job.get('type', 'unknown')
        log_pattern = job.get('log_pattern', f'{BACKUP_LOG_DIR}/{job_name}*.log')
        labels = {'job_name': job_name, 'backup_type': backup_type}

        log_files = glob.glob(log_pattern)
        if not log_files:
            # No logs found - mark as failed
            backup_success.labels(**labels).set(0)
            continue

        try:
            # Modification time identifies the newest log; ctime also changes
            # on chmod/chown/rename and can pick an old file.
            latest_log = max(log_files, key=os.path.getmtime)
            log_mtime = os.path.getmtime(latest_log)
            # Undecodable bytes must not turn a readable log into a failure
            with open(latest_log, 'r', errors='replace') as f:
                log_content = f.read()

            parser = parsers.get(backup_type, parse_generic_backup_log)
            parser(log_content, job_name, backup_type)

            # The parsers stamp the current time; overwrite it with the time
            # the log was last written so a stale log actually looks stale.
            backup_last_run.labels(**labels).set(log_mtime)
        except Exception as e:
            print(f"Error parsing log for {job_name}: {e}")
            backup_success.labels(**labels).set(0)
_MYSQL_SIZE_UNITS = {'mb': 1024 * 1024, 'gb': 1024 * 1024 * 1024}


def _mysql_size_to_bytes(text):
    """Convert the text following 'size:' (e.g. '150MB', '1.5 gb') to bytes.

    Returns None when no MB/GB figure can be read from the text.
    """
    tokens = text.lower().split()
    if not tokens:
        return None
    number, unit = tokens[0], ''
    for suffix in _MYSQL_SIZE_UNITS:
        if number.endswith(suffix):
            number, unit = number[:-len(suffix)], suffix
            break
    else:
        # Unit written as a separate word: "150 MB"
        if len(tokens) > 1 and tokens[1] in _MYSQL_SIZE_UNITS:
            unit = tokens[1]
    if not unit:
        return None
    try:
        return float(number) * _MYSQL_SIZE_UNITS[unit]
    except ValueError:
        return None


def parse_mysql_backup_log(content, job_name, backup_type):
    """Parse a MySQL backup log and publish its metrics.

    Args:
        content: Full text of the log file.
        job_name: Value for the ``job_name`` label.
        backup_type: Value for the ``backup_type`` label.

    Each line is matched case-insensitively against, in order: a success
    marker, a failure marker, ``duration:`` and ``size:``. The last status
    marker in the log decides the success value. Values that cannot be
    parsed leave the corresponding metric at 0.
    """
    success = 0
    duration = 0
    size_bytes = 0
    # NOTE: this is the time the log was parsed, not the time the backup ran
    last_run = time.time()

    for line in content.strip().split('\n'):
        lowered = line.lower()
        if 'backup completed successfully' in lowered:
            success = 1
        elif 'backup failed' in lowered or 'error' in lowered:
            success = 0
        elif 'duration:' in lowered:
            try:
                # Split the lowered text so "Duration:" matches too
                value = lowered.split('duration:', 1)[1].split()[0]
                duration = float(value.rstrip('s'))
            except (IndexError, ValueError):
                pass
        elif 'size:' in lowered:
            parsed = _mysql_size_to_bytes(lowered.split('size:', 1)[1])
            if parsed is not None:
                size_bytes = parsed

    labels = {'job_name': job_name, 'backup_type': backup_type}
    backup_success.labels(**labels).set(success)
    backup_duration.labels(**labels).set(duration)
    backup_size_bytes.labels(**labels).set(size_bytes)
    backup_last_run.labels(**labels).set(last_run)
def parse_postgres_backup_log(content, job_name, backup_type):
    """Parse a PostgreSQL (pg_dump) backup log and publish its metrics.

    Args:
        content: Full text of the log file.
        job_name: Value for the ``job_name`` label.
        backup_type: Value for the ``backup_type`` label.

    pg_dump progress lines mark the job successful and pg_dump error lines
    mark it failed; the last such line wins. The duration is read from a
    ``total time:`` line. The backup size is taken from the first log line
    that is itself the path of an existing ``.sql`` file.
    """
    lines = content.strip().split('\n')
    success = 0
    duration = 0
    size_bytes = 0
    # NOTE: this is the time the log was parsed, not the time the backup ran
    last_run = time.time()

    for line in lines:
        if 'pg_dump: last built-in oid is' in line or 'pg_dump: dumping contents of table' in line:
            success = 1
        elif 'pg_dump: error:' in line or 'connection to database failed' in line:
            success = 0
        elif 'total time:' in line.lower():
            try:
                # Take the first token after the last colon and drop a
                # trailing "s" unit, so "12.5s" and "12.5 seconds" both parse
                value = line.rsplit(':', 1)[-1].split()[0]
                duration = float(value.rstrip('s'))
            except (IndexError, ValueError):
                pass

    # Get file size if backup file path is in log
    for line in lines:
        candidate = line.strip()
        if '.sql' in candidate and os.path.exists(candidate):
            try:
                size_bytes = os.path.getsize(candidate)
            except OSError:
                # File vanished or is unreadable; keep the size at 0
                pass
            break

    labels = {'job_name': job_name, 'backup_type': backup_type}
    backup_success.labels(**labels).set(success)
    backup_duration.labels(**labels).set(duration)
    backup_size_bytes.labels(**labels).set(size_bytes)
    backup_last_run.labels(**labels).set(last_run)
def _leading_int(text):
    """Return the first token of *text* as an int, or None.

    A leading colon and thousands separators are ignored, so both
    "is 1,234 speedup" style and ": 1,234 bytes" style values parse.
    """
    tokens = text.lstrip(' :\t').split()
    if not tokens:
        return None
    try:
        return int(tokens[0].replace(',', ''))
    except ValueError:
        return None


def parse_filesystem_backup_log(content, job_name, backup_type):
    """Parse a filesystem backup log (rsync, tar, etc.) and publish metrics.

    Args:
        content: Full text of the log file.
        job_name: Value for the ``job_name`` label.
        backup_type: Value for the ``backup_type`` label.

    Success is set by a "backup completed" line or an rsync "sent ... bytes"
    summary line and cleared by a failure line; the last such line wins.
    Size and file count are matched case-insensitively so that summary lines
    such as "Number of files: 1,234" and "Total transferred file size: ..."
    are recognised. No duration is extracted; the duration gauge is set to 0.
    """
    success = 0
    duration = 0
    size_bytes = 0
    files_count = 0
    # NOTE: this is the time the log was parsed, not the time the backup ran
    last_run = time.time()

    for line in content.strip().split('\n'):
        lowered = line.lower()
        if 'backup completed' in lowered or ('sent ' in line and 'bytes' in line):
            success = 1
        elif 'backup failed' in lowered or 'rsync error' in lowered:
            success = 0
        elif 'total size is' in lowered:
            value = _leading_int(lowered.split('total size is', 1)[1])
            if value is not None:
                size_bytes = value
        elif 'total transferred file size' in lowered:
            value = _leading_int(lowered.split('total transferred file size', 1)[1])
            if value is not None:
                size_bytes = value
        elif 'number of files' in lowered:
            value = _leading_int(lowered.split('number of files', 1)[1])
            if value is not None:
                files_count = value

    labels = {'job_name': job_name, 'backup_type': backup_type}
    backup_success.labels(**labels).set(success)
    backup_duration.labels(**labels).set(duration)
    backup_size_bytes.labels(**labels).set(size_bytes)
    backup_last_run.labels(**labels).set(last_run)
    backup_files_count.labels(**labels).set(files_count)
def parse_generic_backup_log(content, job_name, backup_type):
    """Derive a success flag from a log of unknown format.

    The log is scanned from the top; the first line containing a success
    or failure keyword decides the result (success keywords are checked
    first on each line). A log with no keyword at all counts as failed.
    """
    ok_markers = ('success', 'completed', 'finished', 'done')
    bad_markers = ('error', 'failed', 'abort')

    status = 0
    run_timestamp = time.time()

    for raw_line in content.strip().split('\n'):
        text = raw_line.lower()
        if any(marker in text for marker in ok_markers):
            status = 1
            break
        if any(marker in text for marker in bad_markers):
            # status is already 0; stop at the first decisive line
            break

    labelled = {'job_name': job_name, 'backup_type': backup_type}
    backup_success.labels(**labelled).set(status)
    backup_last_run.labels(**labelled).set(run_timestamp)
def main():
    """Collect backup metrics and write them to the textfile directory."""
    target = os.path.join(METRICS_DIR, 'backup_metrics.prom')

    # The output directory must exist before the metrics file is written
    os.makedirs(METRICS_DIR, exist_ok=True)

    # Populate the gauges, then dump the registry for node_exporter
    parse_backup_logs()
    write_to_textfile(target, registry)
    print(f"Backup metrics exported to {target}")


if __name__ == '__main__':
    main()
Create backup monitoring configuration
Define your backup jobs in /etc/backup-monitor.json, the configuration file the exporter reads.
{
"jobs": [
{
"name": "mysql_daily",
"type": "mysql",
"log_pattern": "/var/log/backups/mysql_*.log",
"schedule": "0 2 * * *",
"retention_days": 30
},
{
"name": "postgres_nightly",
"type": "postgres",
"log_pattern": "/var/log/backups/postgres_*.log",
"schedule": "0 1 * * *",
"retention_days": 14
},
{
"name": "web_files",
"type": "filesystem",
"log_pattern": "/var/log/backups/web_backup_*.log",
"schedule": "0 3 * * *",
"retention_days": 7
},
{
"name": "config_backup",
"type": "filesystem",
"log_pattern": "/var/log/backups/config_*.log",
"schedule": "0 4 * * *",
"retention_days": 90
}
]
}
Make exporter executable and create directories
Set proper permissions and create required directories for the monitoring system.
# Make the exporter script runnable directly (the systemd unit executes this path)
sudo chmod +x /usr/local/bin/backup-exporter.py
# Directory the exporter writes backup_metrics.prom into
sudo mkdir -p /var/lib/prometheus/node-exporter
# Directory the backup jobs are expected to write their logs to
sudo mkdir -p /var/log/backups
# The exporter service runs as the prometheus user and must be able to write here
sudo chown -R prometheus:prometheus /var/lib/prometheus
Configure Node Exporter for backup metrics
Configure Node Exporter to include the textfile collector for backup metrics.
sudo apt install -y prometheus-node-exporter
# The packaged service reads its extra flags from ARGS in this defaults file;
# assigning ARGS in an interactive shell has no effect on the service.
echo 'ARGS="--collector.textfile.directory=/var/lib/prometheus/node-exporter"' | sudo tee /etc/default/prometheus-node-exporter
sudo systemctl restart prometheus-node-exporter
sudo systemctl enable prometheus-node-exporter
Create systemd timer for metric collection
Set up automated metric collection that runs every 5 minutes to keep data current.
# backup-exporter.service — one-shot unit that runs the exporter script once.
# Presumably saved as /etc/systemd/system/backup-exporter.service — confirm.
[Unit]
Description=Backup Metrics Exporter
Wants=backup-exporter.timer
[Service]
# oneshot: the unit is considered started once the script has exited
Type=oneshot
# Run unprivileged as the owner of the metrics directory
User=prometheus
Group=prometheus
ExecStart=/usr/local/bin/backup-exporter.py
# Script output and errors go to the journal
StandardOutput=journal
StandardError=journal
[Install]
WantedBy=multi-user.target
# backup-exporter.timer — schedules backup-exporter.service.
# Presumably saved as /etc/systemd/system/backup-exporter.timer — confirm.
[Unit]
Description=Run backup metrics exporter every 5 minutes
# No Requires=backup-exporter.service: a timer activates the service with the
# same name by default, and Requires= would additionally start the service
# every time the timer itself is started instead of only on schedule.

[Timer]
# Fire at minute 0, 5, 10, ... of every hour
OnCalendar=*:0/5
# Catch up on a run that was missed while the machine was off
Persistent=true

[Install]
WantedBy=timers.target
sudo systemctl daemon-reload
sudo systemctl enable --now backup-exporter.timer
sudo systemctl status backup-exporter.timer
Configure Prometheus to scrape backup metrics
Add backup monitoring to your existing Prometheus configuration.
# Add this job to your existing prometheus.yml
scrape_configs:
  - job_name: 'backup-monitoring'
    static_configs:
      - targets: ['localhost:9100']
    scrape_interval: 30s
    metrics_path: /metrics
    # Request only the textfile collector's output from node_exporter
    params:
      collect[]:
        - textfile
sudo systemctl reload prometheus
Create Prometheus alerting rules for backups
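Define alert rules to notify when backups fail or miss their scheduled runs. Save them in a rules file that is listed under rule_files in prometheus.yml so that Prometheus loads them on reload.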
groups:
  - name: backup_monitoring
    rules:
      # The most recently parsed log reported a failure (or no log was found)
      - alert: BackupJobFailed
        expr: backup_job_success == 0
        for: 5m
        labels:
          severity: critical
          service: backup
        annotations:
          summary: "Backup job {{ $labels.job_name }} failed"
          description: "Backup job {{ $labels.job_name }} of type {{ $labels.backup_type }} has failed. Check logs for details."

      # The last-run timestamp is more than 24 hours (86400 s) old
      - alert: BackupJobMissing
        expr: time() - backup_job_last_run_timestamp > 86400
        for: 30m
        labels:
          severity: warning
          service: backup
        annotations:
          summary: "Backup job {{ $labels.job_name }} hasn't run in 24 hours"
          description: "Backup job {{ $labels.job_name }} last ran {{ $value | humanizeDuration }} ago. Expected daily runs."

      # Current size is below half or above double the 7-day average;
      # the "!= 0" filter drops series whose average is zero.
      - alert: BackupSizeAnomaly
        expr: |
          (
            backup_size_bytes
            /
            (avg_over_time(backup_size_bytes[7d]) != 0)
          ) < 0.5 or
          (
            backup_size_bytes
            /
            (avg_over_time(backup_size_bytes[7d]) != 0)
          ) > 2
        for: 1h
        labels:
          severity: warning
          service: backup
        annotations:
          summary: "Backup size anomaly for {{ $labels.job_name }}"
          description: "Backup job {{ $labels.job_name }} size is {{ $value | humanizePercentage }} of the 7-day average, indicating potential issues."

      - alert: BackupDurationHigh
        expr: backup_job_duration_seconds > 7200 # 2 hours
        for: 15m
        labels:
          severity: warning
          service: backup
        annotations:
          summary: "Backup job {{ $labels.job_name }} taking too long"
          description: "Backup job {{ $labels.job_name }} has been running for {{ $value | humanizeDuration }}, which is unusually long."
sudo systemctl reload prometheus
Import Grafana dashboard for backup monitoring
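Create a comprehensive Grafana dashboard to visualize backup metrics and trends. Save the JSON below as /tmp/backup-dashboard.json, the path used by the import command that follows it.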
{
"dashboard": {
"id": null,
"title": "Backup Monitoring Dashboard",
"tags": ["backup", "monitoring"],
"timezone": "browser",
"panels": [
{
"id": 1,
"title": "Backup Success Rate",
"type": "stat",
"targets": [
{
"expr": "avg(backup_job_success) * 100",
"legendFormat": "Success Rate %"
}
],
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"thresholds": {
"steps": [
{"color": "red", "value": 0},
{"color": "yellow", "value": 80},
{"color": "green", "value": 95}
]
},
"unit": "percent"
}
},
"gridPos": {"h": 8, "w": 6, "x": 0, "y": 0}
},
{
"id": 2,
"title": "Backup Jobs Status",
"type": "table",
"targets": [
{
"expr": "backup_job_success",
"legendFormat": "{{job_name}} ({{backup_type}})",
"format": "table",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {"Time": true, "__name__": true, "instance": true, "job": true},
"renameByName": {
"job_name": "Backup Job",
"backup_type": "Type",
"Value": "Status"
}
}
}
],
"fieldConfig": {
"overrides": [
{
"matcher": {"id": "byName", "options": "Status"},
"properties": [
{
"id": "custom.displayMode",
"value": "color-background"
},
{
"id": "thresholds",
"value": {
"steps": [
{"color": "red", "value": 0},
{"color": "green", "value": 1}
]
}
}
]
}
]
},
"gridPos": {"h": 8, "w": 18, "x": 6, "y": 0}
},
{
"id": 3,
"title": "Backup Size Trends",
"type": "timeseries",
"targets": [
{
"expr": "backup_size_bytes",
"legendFormat": "{{job_name}} Size"
}
],
"fieldConfig": {
"defaults": {
"unit": "bytes"
}
},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
},
{
"id": 4,
"title": "Backup Duration",
"type": "timeseries",
"targets": [
{
"expr": "backup_job_duration_seconds",
"legendFormat": "{{job_name}} Duration"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
}
},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
},
{
"id": 5,
"title": "Last Backup Run",
"type": "table",
"targets": [
{
"expr": "backup_job_last_run_timestamp * 1000",
"legendFormat": "{{job_name}}",
"format": "table",
"instant": true
}
],
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {"Time": true, "__name__": true, "instance": true, "job": true},
"renameByName": {
"job_name": "Backup Job",
"backup_type": "Type",
"Value": "Last Run"
}
}
}
],
"fieldConfig": {
"overrides": [
{
"matcher": {"id": "byName", "options": "Last Run"},
"properties": [
{
"id": "unit",
"value": "dateTimeAsLocal"
}
]
}
]
},
"gridPos": {"h": 6, "w": 24, "x": 0, "y": 16}
}
],
"time": {
"from": "now-24h",
"to": "now"
},
"refresh": "30s"
}
}
# Import dashboard via Grafana API
curl -X POST -H "Content-Type: application/json" \
-H "Authorization: Bearer YOUR_GRAFANA_API_KEY" \
-d @/tmp/backup-dashboard.json \
http://localhost:3000/api/dashboards/db
Configure Alertmanager for backup notifications
Set up email notifications when backup jobs fail or miss their schedules.
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@example.com'

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  # Default receiver for alerts that match no child route
  receiver: 'backup-alerts'
  routes:
    # Alerts labelled service=backup go to the backup team
    - match:
        service: backup
      receiver: 'backup-team'

receivers:
  - name: 'backup-alerts'
    email_configs:
      - to: 'admin@example.com'
        # email_configs has no "subject"/"body" keys: the subject is set
        # through headers and the plain-text body through "text".
        headers:
          Subject: 'Backup Alert: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Severity: {{ .Labels.severity }}
          Job: {{ .Labels.job_name }}
          {{ end }}
  - name: 'backup-team'
    email_configs:
      - to: 'backup-team@example.com'
        headers:
          Subject: 'BACKUP ISSUE: {{ .GroupLabels.alertname }}'
        text: |
          Backup monitoring has detected an issue:
          {{ range .Alerts }}
          Job: {{ .Labels.job_name }}
          Type: {{ .Labels.backup_type }}
          Issue: {{ .Annotations.summary }}
          Details: {{ .Annotations.description }}
          Severity: {{ .Labels.severity }}
          {{ end }}
          Please investigate immediately.
sudo systemctl reload alertmanager
Verify your setup
Test the backup monitoring system to ensure metrics are being collected and displayed correctly.
# Run the backup exporter manually to test
sudo -u prometheus /usr/local/bin/backup-exporter.py

# Check that metrics file was created
ls -la /var/lib/prometheus/node-exporter/backup_metrics.prom

# View current metrics
cat /var/lib/prometheus/node-exporter/backup_metrics.prom

# Check Node Exporter is serving backup metrics
curl -s http://localhost:9100/metrics | grep backup_

# Verify Prometheus is scraping the metrics
curl -s "http://localhost:9090/api/v1/query?query=backup_job_success" | jq .

# Check alerting rules are loaded
curl -s http://localhost:9090/api/v1/rules | jq '.data.groups[] | select(.name=="backup_monitoring")'

# Test timer is running
sudo systemctl status backup-exporter.timer
# Follows the service log until interrupted with Ctrl-C
journalctl -u backup-exporter.service -f
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| No backup metrics in Prometheus | Node Exporter not configured for textfile collector | Add --collector.textfile.directory to Node Exporter args and restart |
| Backup exporter fails with permission denied | Wrong file ownership or permissions | sudo chown prometheus:prometheus /var/lib/prometheus/node-exporter |
| Metrics file empty or not updating | Backup logs not found or unreadable | Check log paths in /etc/backup-monitor.json and ensure logs exist |
| False positive alerts | Log parsing not matching your backup format | Customize parsing functions in backup-exporter.py for your log format |
| Grafana dashboard shows no data | Data source not configured correctly | Verify Prometheus data source in Grafana points to correct URL |
| Email alerts not sending | SMTP configuration incorrect | Test SMTP settings: echo "test" \| mail -s "test" admin@example.com |
Next steps
- Configure Prometheus Alertmanager with email notifications for production monitoring
- Set up automated MySQL database backups with compression and rotation
- Configure advanced Grafana dashboards and alerting with Prometheus integration
- Implement backup retention policies with automated cleanup
- Configure distributed backup monitoring across multiple servers
Running this in production?
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
set -euo pipefail
# Backup Monitoring with Prometheus and Grafana Install Script
# Supports Ubuntu, Debian, AlmaLinux, Rocky Linux, CentOS, RHEL
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Configuration
GRAFANA_PORT=${GRAFANA_PORT:-3000}
PROMETHEUS_PORT=${PROMETHEUS_PORT:-9090}
NODE_EXPORTER_PORT=${NODE_EXPORTER_PORT:-9100}
# Print colored output
print_status() {
    # Print an informational message prefixed with a green [INFO] tag
    local message="$1"
    echo -e "${GREEN}[INFO]${NC} ${message}"
}
print_warning() {
    # Print a warning message prefixed with a yellow [WARNING] tag
    local message="$1"
    echo -e "${YELLOW}[WARNING]${NC} ${message}"
}
print_error() {
    # Print an error message prefixed with a red [ERROR] tag
    local message="$1"
    echo -e "${RED}[ERROR]${NC} ${message}"
}
# Usage message
usage() {
    # Print the option summary, then exit with status 1
    printf '%s\n' \
        "Usage: $0 [OPTIONS]" \
        "Options:" \
        " --grafana-port PORT Grafana port (default: 3000)" \
        " --prometheus-port PORT Prometheus port (default: 9090)" \
        " --help Show this help message"
    exit 1
}
# Parse arguments
# Consume command-line options; anything unrecognised prints usage and exits
while [[ $# -gt 0 ]]; do
    case $1 in
        --grafana-port|--prometheus-port)
            # Without this check a missing value trips "set -u" on $2 and
            # aborts with an unhelpful "unbound variable" message.
            if [[ $# -lt 2 ]]; then
                echo "Option $1 requires a value"
                usage
            fi
            if [[ "$1" == "--grafana-port" ]]; then
                GRAFANA_PORT="$2"
            else
                PROMETHEUS_PORT="$2"
            fi
            shift 2
            ;;
        --help)
            usage
            ;;
        *)
            echo "Unknown option: $1"
            usage
            ;;
    esac
done
# Cleanup function for rollback
cleanup() {
    # Best-effort rollback run by the ERR trap: stop and disable every
    # service this script manages, ignoring units that do not exist yet.
    print_error "Installation failed. Rolling back changes..."
    # ${SUDO:-} lets the rollback work for non-root users and stays safe
    # under "set -u" if the trap fires before SUDO has been assigned.
    ${SUDO:-} systemctl stop prometheus grafana-server node_exporter backup-monitor.timer 2>/dev/null || true
    ${SUDO:-} systemctl disable prometheus grafana-server node_exporter backup-monitor.timer 2>/dev/null || true
    exit 1
}
trap cleanup ERR
# Check if running as root or with sudo
# SUDO stays empty for root; otherwise every privileged command is prefixed
# with sudo, which therefore has to be installed.
SUDO=""
if [[ $EUID -ne 0 ]]; then
    if ! command -v sudo &> /dev/null; then
        print_error "This script requires root privileges or sudo"
        exit 1
    fi
    SUDO="sudo"
fi
# Detect distribution
# Pick the package-manager commands for this distribution from the ID field
# of /etc/os-release; unknown or undetectable distributions abort the script.
print_status "Detecting distribution..."
if [ -f /etc/os-release ]; then
# Sourcing the file defines ID and PRETTY_NAME, both used below
. /etc/os-release
case "$ID" in
ubuntu|debian)
PKG_MGR="apt"
# apt update only refreshes the package index
PKG_UPDATE="apt update"
PKG_INSTALL="apt install -y"
# NOTE(review): FIREWALL_CMD is not referenced in the visible part of the script
FIREWALL_CMD="ufw"
;;
almalinux|rocky|centos|rhel|ol|fedora)
PKG_MGR="dnf"
# NOTE(review): unlike apt update, this upgrades installed packages — confirm intended
PKG_UPDATE="dnf update -y"
PKG_INSTALL="dnf install -y"
FIREWALL_CMD="firewall-cmd"
;;
amzn)
PKG_MGR="yum"
PKG_UPDATE="yum update -y"
PKG_INSTALL="yum install -y"
FIREWALL_CMD="firewall-cmd"
;;
*)
print_error "Unsupported distribution: $ID"
exit 1
;;
esac
else
print_error "Cannot detect distribution"
exit 1
fi
print_status "Detected distribution: $PRETTY_NAME"
echo "[1/8] Updating system packages..."
$SUDO $PKG_UPDATE
echo "[2/8] Installing dependencies..."
if [[ "$PKG_MGR" == "apt" ]]; then
    $SUDO $PKG_INSTALL python3-pip curl wget jq gnupg2 software-properties-common
else
    $SUDO $PKG_INSTALL python3-pip curl wget jq
fi
# Install Python packages
if [[ "$PKG_MGR" == "apt" ]]; then
    # Current Debian/Ubuntu releases mark the system Python as externally
    # managed (PEP 668), so a system-wide pip install fails there and would
    # trigger the rollback; use the distribution packages instead.
    $SUDO $PKG_INSTALL python3-prometheus-client python3-psutil
else
    $SUDO pip3 install prometheus_client psutil
fi
# Download the Prometheus release tarball, install its binaries into
# /usr/local/bin and its console assets into /etc/prometheus.
echo "[3/8] Installing Prometheus..."
PROM_VERSION="2.45.0"
# All downloads and extraction from here on happen in /tmp
cd /tmp
# NOTE(review): architecture is fixed to linux-amd64 and the archive is not checksum-verified
wget -q "https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz"
tar xzf "prometheus-${PROM_VERSION}.linux-amd64.tar.gz"
# Service account without home or login shell; errors are discarded,
# presumably so a re-run does not fail when the user already exists
$SUDO useradd --no-create-home --shell /bin/false prometheus 2>/dev/null || true
$SUDO mkdir -p /etc/prometheus /var/lib/prometheus
$SUDO cp "prometheus-${PROM_VERSION}.linux-amd64/prometheus" /usr/local/bin/
$SUDO cp "prometheus-${PROM_VERSION}.linux-amd64/promtool" /usr/local/bin/
$SUDO cp -r "prometheus-${PROM_VERSION}.linux-amd64/consoles" /etc/prometheus/
$SUDO cp -r "prometheus-${PROM_VERSION}.linux-amd64/console_libraries" /etc/prometheus/
# Config and data directories belong to the prometheus service account
$SUDO chown -R prometheus:prometheus /etc/prometheus /var/lib/prometheus
$SUDO chmod 755 /usr/local/bin/prometheus /usr/local/bin/promtool
# Download the Node Exporter release tarball (into the current directory,
# /tmp at this point) and install the binary into /usr/local/bin.
echo "[4/8] Installing Node Exporter..."
NODE_VERSION="1.6.0"
# NOTE(review): architecture is fixed to linux-amd64 and the archive is not checksum-verified
wget -q "https://github.com/prometheus/node_exporter/releases/download/v${NODE_VERSION}/node_exporter-${NODE_VERSION}.linux-amd64.tar.gz"
tar xzf "node_exporter-${NODE_VERSION}.linux-amd64.tar.gz"
$SUDO cp "node_exporter-${NODE_VERSION}.linux-amd64/node_exporter" /usr/local/bin/
# Dedicated service account; errors are discarded, presumably for re-runs
$SUDO useradd --no-create-home --shell /bin/false node_exporter 2>/dev/null || true
# NOTE(review): this makes the binary writable by the account that runs it — confirm intended
$SUDO chown node_exporter:node_exporter /usr/local/bin/node_exporter
echo "[5/8] Installing Grafana..."
if [[ "$PKG_MGR" == "apt" ]]; then
    # apt-key is deprecated and missing from current Debian/Ubuntu releases,
    # so keep the repository key in its own keyring and bind the source to it.
    $SUDO mkdir -p /etc/apt/keyrings
    wget -q -O - https://packages.grafana.com/gpg.key | gpg --dearmor | $SUDO tee /etc/apt/keyrings/grafana.gpg > /dev/null
    echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://packages.grafana.com/oss/deb stable main" | $SUDO tee /etc/apt/sources.list.d/grafana.list
    $SUDO apt update
    $SUDO $PKG_INSTALL grafana
else
    # RPM-based systems: register the Grafana repository, then install
    $SUDO tee /etc/yum.repos.d/grafana.repo << 'EOF'
[grafana]
name=grafana
baseurl=https://packages.grafana.com/oss/rpm
repo_gpgcheck=1
enabled=1
gpgcheck=1
gpgkey=https://packages.grafana.com/gpg.key
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
EOF
    $SUDO $PKG_INSTALL grafana
fi
echo "[6/8] Creating configuration files..."
# Prometheus config
# The heredoc delimiter is unquoted so the port variables are expanded now.
# YAML nesting is expressed through indentation, so it must be kept intact.
# NOTE(review): backup_monitor scrapes the same endpoint as node_exporter, so
# every metric is stored under both job labels — confirm this is intended.
$SUDO tee /etc/prometheus/prometheus.yml << EOF
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:${PROMETHEUS_PORT}']

  - job_name: 'node_exporter'
    static_configs:
      - targets: ['localhost:${NODE_EXPORTER_PORT}']

  - job_name: 'backup_monitor'
    static_configs:
      - targets: ['localhost:${NODE_EXPORTER_PORT}']
    metrics_path: '/metrics'
EOF
# Backup monitor configuration
$SUDO mkdir -p /etc/backup-monitor /var/log/backups /var/lib/prometheus/node-exporter
# The textfile directory was just created by root, after the earlier
# recursive chown of /var/lib/prometheus. The monitor runs as prometheus and
# must be able to write its metrics file here from its very first run.
$SUDO chown prometheus:prometheus /var/lib/prometheus/node-exporter
$SUDO tee /etc/backup-monitor.json << 'EOF'
{
"jobs": [
{
"name": "mysql_daily",
"type": "mysql",
"log_pattern": "/var/log/backups/mysql_*.log"
},
{
"name": "filesystem_daily",
"type": "filesystem",
"log_pattern": "/var/log/backups/filesystem_*.log"
}
]
}
EOF
# Backup monitoring script
$SUDO tee /usr/local/bin/backup_monitor.py << 'EOF'
#!/usr/bin/env python3
import json
import os
import time
import glob
import re
from datetime import datetime
from prometheus_client import CollectorRegistry, Gauge, write_to_textfile
BACKUP_LOG_DIR = '/var/log/backups'
METRICS_FILE = '/var/lib/prometheus/node-exporter/backup_metrics.prom'
BACKUP_CONFIGS = '/etc/backup-monitor.json'
registry = CollectorRegistry()
backup_success = Gauge('backup_job_success', 'Backup job success status', ['job_name', 'backup_type'], registry=registry)
backup_duration = Gauge('backup_job_duration_seconds', 'Backup job duration', ['job_name', 'backup_type'], registry=registry)
backup_size_bytes = Gauge('backup_size_bytes', 'Backup size in bytes', ['job_name', 'backup_type'], registry=registry)
backup_last_run = Gauge('backup_job_last_run_timestamp', 'Last backup run timestamp', ['job_name', 'backup_type'], registry=registry)
def load_config():
    """Return the parsed job configuration; an empty job list if the file is missing."""
    try:
        config_file = open(BACKUP_CONFIGS, 'r')
    except FileNotFoundError:
        return {"jobs": []}
    with config_file:
        return json.load(config_file)
def parse_logs():
    """Update the gauges from the newest log file of every configured job.

    A job is reported successful when its newest log mentions "success" and
    never mentions "error" (case-insensitive). Jobs with no log files or an
    unreadable log are reported as failed. The last-run timestamp is the
    modification time of the log, and the size is the first "<n> bytes"
    figure found in it, if any.
    """
    config = load_config()
    for job in config.get('jobs', []):
        job_name = job['name']
        backup_type = job.get('type', 'unknown')
        labels = {'job_name': job_name, 'backup_type': backup_type}
        log_pattern = job.get('log_pattern', f'{BACKUP_LOG_DIR}/{job_name}*.log')

        log_files = glob.glob(log_pattern)
        if not log_files:
            backup_success.labels(**labels).set(0)
            continue

        try:
            # mtime identifies the newest log; ctime also changes on
            # chmod/chown/rename and can select an old file
            latest_log = max(log_files, key=os.path.getmtime)
            last_run = os.path.getmtime(latest_log)
            # Undecodable bytes must not turn a readable log into a failure
            with open(latest_log, 'r', errors='replace') as f:
                content = f.read()
        except OSError:
            backup_success.labels(**labels).set(0)
            continue

        lowered = content.lower()
        success = 1 if 'success' in lowered and 'error' not in lowered else 0
        backup_success.labels(**labels).set(success)
        # Use the log's own timestamp; stamping "now" on every run would
        # make a stale backup look fresh forever
        backup_last_run.labels(**labels).set(last_run)

        # Extract size from logs if available
        size_match = re.search(r'(\d+)\s*bytes?', content, re.IGNORECASE)
        if size_match:
            backup_size_bytes.labels(**labels).set(int(size_match.group(1)))
if __name__ == "__main__":
parse_logs()
write_to_textfile(METRICS_FILE, registry)
EOF
$SUDO chmod 755 /usr/local/bin/backup_monitor.py
$SUDO chown prometheus:prometheus /usr/local/bin/backup_monitor.py
echo "[7/8] Creating systemd services..."
# Prometheus service unit. The heredoc delimiter is unquoted, so
# ${PROMETHEUS_PORT} is expanded now and each "\\" is written to the unit
# file as a single line-continuation backslash.
# NOTE(review): 0.0.0.0 makes Prometheus listen on every interface — confirm intended.
$SUDO tee /etc/systemd/system/prometheus.service << EOF
[Unit]
Description=Prometheus
Wants=network-online.target
After=network-online.target
[Service]
User=prometheus
Group=prometheus
Type=simple
ExecStart=/usr/local/bin/prometheus \\
--config.file /etc/prometheus/prometheus.yml \\
--storage.tsdb.path /var/lib/prometheus/ \\
--web.console.templates=/etc/prometheus/consoles \\
--web.console.libraries=/etc/prometheus/console_libraries \\
--web.listen-address=0.0.0.0:${PROMETHEUS_PORT}
[Install]
WantedBy=multi-user.target
EOF
# Node exporter service unit. The unquoted heredoc expands
# ${NODE_EXPORTER_PORT} now; the textfile collector is pointed at the
# directory the backup monitor writes its metrics file into.
$SUDO tee /etc/systemd/system/node_exporter.service << EOF
[Unit]
Description=Node Exporter
Wants=network-online.target
After=network-online.target
[Service]
User=node_exporter
Group=node_exporter
Type=simple
ExecStart=/usr/local/bin/node_exporter --web.listen-address=:${NODE_EXPORTER_PORT} --collector.textfile.directory=/var/lib/prometheus/node-exporter
[Install]
WantedBy=multi-user.target
EOF
# Backup monitor service: one-shot unit that runs the monitoring script once
# as the prometheus user. It has no [Install] section; the timer unit
# written next is the one that gets enabled. The quoted 'EOF' delimiter
# writes the heredoc body verbatim.
$SUDO tee /etc/systemd/system/backup-monitor.service << 'EOF'
[Unit]
Description=Backup Monitor
After=network.target
[Service]
Type=oneshot
ExecStart=/usr/local/bin/backup_monitor.py
User=prometheus
Group=prometheus
EOF
# Backup monitor timer: triggers backup-monitor.service every five minutes.
# Requires= is deliberately left out. A timer activates the service with the
# same name by default, and Requires= would also start the service the moment
# the timer is started rather than only on schedule.
$SUDO tee /etc/systemd/system/backup-monitor.timer << 'EOF'
[Unit]
Description=Run backup monitor every 5 minutes
[Timer]
OnCalendar=*:0/5
Persistent=true
[Install]
WantedBy=timers.target
EOF
echo "[8/8] Starting services..."
# Make systemd pick up the unit files written above, then enable the units
# at boot and start them now
$SUDO systemctl daemon-reload
$SUDO systemctl enable prometheus node_exporter grafana-server backup-monitor.timer
$SUDO systemctl start prometheus node_exporter grafana-server backup-monitor.timer
# Configure firewall
# Whichever firewall tool is installed gets the three service ports opened.
# NOTE(review): this exposes Prometheus and Node Exporter to the network — confirm intended.
if command -v ufw >/dev/null 2>&1; then
$SUDO ufw allow ${PROMETHEUS_PORT}/tcp
$SUDO ufw allow ${GRAFANA_PORT}/tcp
$SUDO ufw allow ${NODE_EXPORTER_PORT}/tcp
elif command -v firewall-cmd >/dev/null 2>&1; then
$SUDO firewall-cmd --permanent --add-port=${PROMETHEUS_PORT}/tcp
$SUDO firewall-cmd --permanent --add-port=${GRAFANA_PORT}/tcp
$SUDO firewall-cmd --permanent --add-port=${NODE_EXPORTER_PORT}/tcp
# --permanent rules only take effect after a reload
$SUDO firewall-cmd --reload
fi
# Set proper permissions
# NOTE(review): ownership is fixed only after the services were started above
$SUDO chown -R prometheus:prometheus /etc/prometheus /var/lib/prometheus
$SUDO chown -R grafana:grafana /etc/grafana
$SUDO chmod 755 /var/log/backups /var/lib/prometheus/node-exporter
print_status "Verifying installation..."
# Give the freshly started services a moment to come up
sleep 5
# Verification checks: every core service must be active
if ! systemctl is-active --quiet prometheus; then
    print_error "Prometheus service is not running"
    exit 1
fi
if ! systemctl is-active --quiet grafana-server; then
    print_error "Grafana service is not running"
    exit 1
fi
if ! systemctl is-active --quiet node_exporter; then
    print_error "Node Exporter service is not running"
    exit 1
fi
# -f makes curl fail on HTTP error statuses; without it an error response
# from the API would still count as "responding"
if ! curl -sf "http://localhost:${PROMETHEUS_PORT}/api/v1/targets" >/dev/null; then
    print_error "Prometheus API is not responding"
    exit 1
fi
print_status "Installation completed successfully!"
echo ""
echo "Access URLs:"
echo " Prometheus: http://localhost
Review the script before running. Execute with: bash install.sh