Deploy a complete monitoring stack using Docker Compose with Prometheus for metrics collection and Grafana for visualization, specifically configured to track AI model performance metrics like inference latency, throughput, and resource utilization.
Prerequisites
- Root or sudo access
- A system capable of running Docker and Docker Compose (both are installed in step 1 if missing)
- 4GB+ RAM recommended
- Network access for container images
What this solves
AI model deployments require specialized monitoring to track inference performance, resource utilization, and model accuracy metrics. This tutorial sets up a production-ready monitoring stack using Docker Compose that combines Prometheus for metrics collection with Grafana for visualization, pre-configured with dashboards optimized for machine learning workloads.
Step-by-step installation
Install Docker and Docker Compose
First, install Docker and Docker Compose on your system to run the containerized monitoring stack.
# Refresh package lists and install prerequisites for Docker's apt repository.
sudo apt update
sudo apt install -y ca-certificates curl gnupg lsb-release
# Import Docker's GPG signing key (dearmored so apt can use it via signed-by).
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
# Register the Docker apt repository for this CPU architecture and release codename.
echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
sudo apt update
# Install Docker Engine plus the Compose v2 plugin (provides `docker compose`).
sudo apt install -y docker-ce docker-ce-cli containerd.io docker-compose-plugin
Start and enable Docker service:
# Start the Docker daemon now and at every boot.
sudo systemctl enable --now docker
# Allow the current user to run docker without sudo. Group membership only
# applies to new logins; `newgrp docker` activates it in the current shell.
sudo usermod -aG docker $USER
newgrp docker
Create monitoring directory structure
Set up the directory structure for your monitoring stack with proper permissions for data persistence.
# Pre-create the config/provisioning/data directories each service expects.
mkdir -p ~/ai-monitoring/{prometheus,grafana/{data,provisioning/{datasources,dashboards}},alertmanager}
cd ~/ai-monitoring
chmod 755 ~/ai-monitoring
# The Grafana container runs as UID/GID 472 and must own its data directory.
sudo chown -R 472:472 grafana/data
Configure Prometheus for AI metrics
Create the main Prometheus configuration file optimized for AI model monitoring with appropriate scrape intervals and retention settings.
# prometheus.yml — scrape configuration for the AI monitoring stack.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'ai-model-monitor'

rule_files:
  - "rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # The sample AI exporter running on the Docker host (port 8000).
  - job_name: 'ai-model-metrics'
    scrape_interval: 5s
    # scrape_timeout must not exceed scrape_interval, otherwise Prometheus
    # rejects the whole configuration at startup.
    scrape_timeout: 5s
    metrics_path: '/metrics'
    honor_labels: true
    static_configs:
      - targets: ['host.docker.internal:8000']

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']

  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']

  - job_name: 'nvidia-gpu'
    static_configs:
      - targets: ['nvidia-gpu-exporter:9835']
    scrape_interval: 10s
Create AI model alerting rules
Define alerting rules specifically for AI model performance monitoring to catch inference latency issues and resource bottlenecks.
# Create the directory referenced by the rule_files glob in prometheus.yml.
mkdir -p prometheus/rules
# prometheus/rules/ai_alerts.yml — alerting rules for AI model performance.
groups:
  - name: ai_model_performance
    rules:
      - alert: HighInferenceLatency
        expr: histogram_quantile(0.95, rate(model_inference_duration_seconds_bucket[5m])) > 2.0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High inference latency detected"
          description: "95th percentile inference latency is {{ $value }}s for model {{ $labels.model_name }}"

      - alert: LowModelThroughput
        expr: rate(model_predictions_total[5m]) < 10
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Low model throughput"
          description: "Model {{ $labels.model_name }} throughput is {{ $value }} predictions/sec"

      # Requires the nvidia-gpu-exporter service; remove on GPU-less hosts.
      - alert: HighGPUMemoryUsage
        expr: (nvidia_ml_py_memory_used_bytes / nvidia_ml_py_memory_total_bytes) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High GPU memory usage"
          description: "GPU {{ $labels.gpu }} memory usage is {{ $value }}%"

      - alert: ModelPredictionErrors
        expr: rate(model_prediction_errors_total[5m]) > 0.1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High model prediction error rate"
          description: "Model {{ $labels.model_name }} error rate is {{ $value }} errors/sec"
Configure Grafana datasource
Set up Grafana to automatically connect to Prometheus as a datasource for AI metrics visualization.
# grafana/provisioning/datasources/prometheus.yml — auto-provisioned datasource.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    # Service name on the compose network — not localhost.
    url: http://prometheus:9090
    isDefault: true
    jsonData:
      timeInterval: 5s
      queryTimeout: 60s
    editable: false
Create AI model performance dashboard
Configure a comprehensive Grafana dashboard specifically designed for monitoring AI model performance metrics.
# grafana/provisioning/dashboards/provider.yml — load dashboard JSON from disk.
apiVersion: 1

providers:
  - name: 'AI Model Dashboards'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /etc/grafana/provisioning/dashboards
{
  "dashboard": {
    "id": null,
    "title": "AI Model Performance",
    "tags": ["ai", "ml", "performance"],
    "timezone": "browser",
    "panels": [
      {
        "id": 1,
        "title": "Model Inference Latency (95th percentile)",
        "type": "stat",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(model_inference_duration_seconds_bucket[5m]))",
            "legendFormat": "{{model_name}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "thresholds": {
              "steps": [
                {"color": "green", "value": null},
                {"color": "yellow", "value": 1},
                {"color": "red", "value": 2}
              ]
            }
          }
        },
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}
      },
      {
        "id": 2,
        "title": "Predictions per Second",
        "type": "stat",
        "targets": [
          {
            "expr": "rate(model_predictions_total[5m])",
            "legendFormat": "{{model_name}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "reqps",
            "thresholds": {
              "steps": [
                {"color": "red", "value": null},
                {"color": "yellow", "value": 10},
                {"color": "green", "value": 50}
              ]
            }
          }
        },
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}
      },
      {
        "id": 3,
        "title": "GPU Memory Usage",
        "type": "timeseries",
        "targets": [
          {
            "expr": "(nvidia_ml_py_memory_used_bytes / nvidia_ml_py_memory_total_bytes) * 100",
            "legendFormat": "GPU {{gpu}}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percent",
            "max": 100
          }
        },
        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 8}
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "5s"
  }
}
Configure Alertmanager
Set up Alertmanager to handle notifications for AI model performance alerts with email and webhook integrations.
# alertmanager/alertmanager.yml — routing and notification configuration.
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  # Replace before deploying; prefer smtp_auth_password_file over inlining.
  smtp_auth_password: 'your-email-password'

route:
  group_by: ['alertname', 'cluster', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'ai-team'
  routes:
    - match:
        severity: critical
      receiver: 'ai-team-critical'

receivers:
  - name: 'ai-team'
    email_configs:
      - to: 'ai-team@example.com'
        # email_config has no `subject`/`body` fields: the subject is set via
        # the Subject header, the message via `text` (or `html`).
        headers:
          Subject: 'AI Model Alert: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
  - name: 'ai-team-critical'
    email_configs:
      - to: 'ai-team@example.com'
        headers:
          Subject: 'CRITICAL: AI Model Issue - {{ .GroupLabels.alertname }}'
        text: |
          CRITICAL ALERT
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
    webhook_configs:
      - url: 'http://your-webhook-url/alerts'
        send_resolved: true
Create Docker Compose configuration
Define the complete monitoring stack with all services configured for AI model monitoring including GPU metrics collection.
# docker-compose.yml — full monitoring stack with GPU metrics collection.
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:v2.47.0
    container_name: ai-prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus:/etc/prometheus
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=15d'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    networks:
      - ai-monitoring
    restart: unless-stopped

  grafana:
    image: grafana/grafana:10.1.0
    container_name: ai-grafana
    ports:
      - "3000:3000"
    volumes:
      - ./grafana/data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      # Change this default password before exposing Grafana beyond localhost.
      - GF_SECURITY_ADMIN_PASSWORD=admin123
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel,grafana-clock-panel
    networks:
      - ai-monitoring
    restart: unless-stopped
    depends_on:
      - prometheus

  alertmanager:
    image: prom/alertmanager:v0.26.0
    container_name: ai-alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager:/etc/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    networks:
      - ai-monitoring
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:v1.6.1
    container_name: ai-node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # $$ escapes $ for compose so Prometheus receives the regex anchor.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    networks:
      - ai-monitoring
    restart: unless-stopped

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.0
    container_name: ai-cadvisor
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
      - /dev/disk/:/dev/disk:ro
    privileged: true
    devices:
      - /dev/kmsg
    networks:
      - ai-monitoring
    restart: unless-stopped

  # Remove this service on hosts without NVIDIA GPUs/drivers.
  nvidia-gpu-exporter:
    image: utkuozdemir/nvidia_gpu_exporter:1.2.0
    container_name: ai-gpu-exporter
    ports:
      - "9835:9835"
    volumes:
      - /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:ro
    devices:
      - /dev/nvidiactl
      - /dev/nvidia0
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    networks:
      - ai-monitoring
    restart: unless-stopped

volumes:
  prometheus-data:
  # NOTE: currently unused — the grafana service bind-mounts ./grafana/data.
  grafana-data:

networks:
  ai-monitoring:
    driver: bridge
Deploy the monitoring stack
Start all services using Docker Compose and verify they're running correctly.
# From the stack directory, start all services detached and list their status.
cd ~/ai-monitoring
docker compose up -d
docker compose ps
Configure sample AI model metrics endpoint
Create a sample Python script that exposes AI model metrics in Prometheus format for testing the monitoring setup.
#!/usr/bin/env python3
"""Sample exporter that publishes synthetic AI model metrics in Prometheus format.

Serves /metrics on port 8000 so the monitoring stack can be exercised without a
real model. Requires the third-party client: pip3 install prometheus_client
"""
from prometheus_client import start_http_server, Summary, Counter, Histogram, Gauge
import time
import random

# Create metrics: latency histogram, prediction/error counters, and gauges for
# accuracy and memory, all labelled by model.
model_inference_duration = Histogram('model_inference_duration_seconds',
                                     'Time spent on model inference',
                                     ['model_name', 'model_version'])
model_predictions_total = Counter('model_predictions_total',
                                  'Total number of predictions made',
                                  ['model_name', 'model_version'])
model_prediction_errors_total = Counter('model_prediction_errors_total',
                                        'Total prediction errors',
                                        ['model_name', 'error_type'])
model_accuracy = Gauge('model_accuracy', 'Current model accuracy', ['model_name'])
model_memory_usage = Gauge('model_memory_usage_bytes', 'Memory usage by model', ['model_name'])


def simulate_inference():
    """Simulate one AI model inference and update all metrics."""
    model_name = random.choice(['bert-classifier', 'resnet-50', 'gpt-3.5'])
    model_version = 'v1.0'

    # Simulate inference latency; the .time() context manager records the
    # elapsed wall time (i.e. the sleep) into the histogram.
    latency = random.uniform(0.1, 3.0)
    with model_inference_duration.labels(model_name=model_name, model_version=model_version).time():
        time.sleep(latency)

    # Increment prediction counter.
    model_predictions_total.labels(model_name=model_name, model_version=model_version).inc()

    # Simulate occasional errors (~5% of calls).
    if random.random() < 0.05:
        error_type = random.choice(['timeout', 'memory_error', 'validation_error'])
        model_prediction_errors_total.labels(model_name=model_name, error_type=error_type).inc()

    # Update model accuracy.
    accuracy = random.uniform(0.85, 0.95)
    model_accuracy.labels(model_name=model_name).set(accuracy)

    # Update memory usage: 500 MB to 2 GB.
    memory_usage = random.randint(500_000_000, 2_000_000_000)
    model_memory_usage.labels(model_name=model_name).set(memory_usage)


if __name__ == '__main__':
    start_http_server(8000)
    print("AI Model metrics server started on port 8000")
    while True:
        simulate_inference()
        time.sleep(1)
Make the script executable and run it to generate sample metrics:
# Install the Prometheus client library, then run the exporter in the background.
chmod +x sample-ai-metrics.py
pip3 install prometheus_client
python3 sample-ai-metrics.py &
Verify your setup
Check that all services are running and accessible:
# Confirm all containers are up, then probe each service's HTTP endpoint.
docker compose ps
# Use Prometheus's dedicated health endpoint; /targets serves a full HTML page
# and is not intended for health checks.
curl -f http://localhost:9090/-/healthy
curl -f http://localhost:3000/api/health
curl -f http://localhost:8000/metrics | head -20
Access the monitoring interfaces:
- Prometheus: http://localhost:9090
- Grafana: http://localhost:3000 (admin/admin123)
- Alertmanager: http://localhost:9093
You can build on this foundation by learning about advanced alerting strategies and monitoring additional services.
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Grafana data directory permission denied | Incorrect ownership of grafana/data | sudo chown -R 472:472 grafana/data |
| nvidia-gpu-exporter fails to start | No NVIDIA GPUs or missing drivers | Remove GPU exporter from docker-compose.yml or install NVIDIA drivers |
| Prometheus can't scrape AI model metrics | host.docker.internal not resolving | Replace with your server's IP address in prometheus.yml |
| Alerts not firing | Alerting rules syntax error | docker exec ai-prometheus promtool check rules /etc/prometheus/rules/*.yml |
| Dashboard not loading data | Prometheus datasource misconfigured | Verify datasource URL is http://prometheus:9090 in Grafana |
Next steps
- Set up long-term storage with Thanos for historical AI model performance data
- Extend monitoring to Kubernetes for containerized AI workloads
- Create custom exporters for your specific AI frameworks
- Add distributed tracing for AI microservices architecture
Running this in production? The automated install script below performs the entire setup unattended.
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
set -euo pipefail
# AI Monitoring Stack Installation Script
# Sets up Docker Compose monitoring with Prometheus and Grafana for AI model performance tracking
# ANSI color codes used by the log_* helpers below.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configuration
INSTALL_DIR="$HOME/ai-monitoring"
# NOTE(review): currently unused — Compose is installed as the `docker compose`
# plugin via the package manager, not pinned to this version.
DOCKER_COMPOSE_VERSION="v2.20.2"
# Print command-line usage to stdout and exit with a usage error (status 1).
usage() {
  local help_lines=(
    "Usage: $0 [OPTIONS]"
    "Options:"
    " -d, --directory DIR Installation directory (default: $HOME/ai-monitoring)"
    " -h, --help Show this help message"
  )
  printf '%s\n' "${help_lines[@]}"
  exit 1
}
# Parse command-line options; -d overrides the install directory, anything
# unrecognized prints an error and the usage text.
while (( $# > 0 )); do
  case "$1" in
    -d|--directory)
      INSTALL_DIR=$2
      shift 2
      ;;
    -h|--help)
      usage
      ;;
    *)
      echo -e "${RED}Error: Unknown option $1${NC}"
      usage
      ;;
  esac
done
# Roll back a failed installation by removing the install directory.
# ${INSTALL_DIR:?} aborts expansion if the variable is ever unset/empty, so
# this can never degenerate into `rm -rf /`-style damage; `--` guards against
# directory names starting with a dash.
# NOTE(review): this deletes the directory even if it existed before the
# script ran — point -d/--directory at a dedicated, empty location.
cleanup() {
  echo -e "${RED}Installation failed. Cleaning up...${NC}"
  if [[ -d "${INSTALL_DIR:-}" ]]; then
    rm -rf -- "${INSTALL_DIR:?}"
  fi
}
trap cleanup ERR
# Logging helpers: colorized, leveled output (echo -e interprets the ANSI
# escapes stored in the color variables). Errors go to stderr so they are
# visible even when stdout is redirected.
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1" >&2; }
# Refuse to run as root, but require that the invoking user can escalate via
# sudo (non-interactively, so the script does not hang on a password prompt).
check_root() {
  (( EUID != 0 )) || {
    log_error "This script should not be run as root. Please run as a regular user with sudo privileges."
    exit 1
  }
  sudo -n true 2>/dev/null || {
    log_error "This script requires sudo privileges. Please ensure you can run sudo commands."
    exit 1
  }
}
# Identify the host distribution via /etc/os-release and derive the package
# manager command strings used by install_docker.
# Globals written: PKG_MGR, PKG_UPDATE, PKG_INSTALL, DOCKER_REPO_SETUP
detect_distro() {
  [[ -f /etc/os-release ]] || {
    log_error "Cannot detect Linux distribution. /etc/os-release not found."
    exit 1
  }
  . /etc/os-release

  local mgr
  case "$ID" in
    ubuntu|debian)
      mgr="apt"
      DOCKER_REPO_SETUP="debian"
      ;;
    almalinux|rocky|centos|rhel|ol)
      mgr="dnf"
      DOCKER_REPO_SETUP="centos"
      ;;
    fedora)
      mgr="dnf"
      DOCKER_REPO_SETUP="fedora"
      ;;
    amzn)
      mgr="yum"
      DOCKER_REPO_SETUP="centos"
      ;;
    *)
      log_error "Unsupported distribution: $ID"
      exit 1
      ;;
  esac

  PKG_MGR="$mgr"
  if [[ "$mgr" == "apt" ]]; then
    PKG_UPDATE="sudo apt update"
    PKG_INSTALL="sudo apt install -y"
  else
    PKG_UPDATE="sudo $mgr update -y"
    PKG_INSTALL="sudo $mgr install -y"
  fi
  log_success "Detected distribution: $PRETTY_NAME"
}
# Install Docker Engine and the Compose plugin for the detected distro.
# Fix: the apt branch previously hardcoded the Ubuntu repository even on
# Debian hosts, which breaks codename resolution; the repo URL now follows
# $ID from /etc/os-release (set by detect_distro).
install_docker() {
  log_info "[1/6] Installing Docker and Docker Compose..."

  if command -v docker >/dev/null 2>&1; then
    log_warning "Docker is already installed, skipping..."
    return 0
  fi

  $PKG_UPDATE

  if [[ "$PKG_MGR" == "apt" ]]; then
    $PKG_INSTALL ca-certificates curl gnupg lsb-release
    # Pick the matching upstream repo; fall back to ubuntu for derivatives.
    local apt_distro="${ID:-ubuntu}"
    case "$apt_distro" in
      ubuntu|debian) ;;
      *) apt_distro="ubuntu" ;;
    esac
    # Add Docker GPG key
    sudo mkdir -p /etc/apt/keyrings
    curl -fsSL "https://download.docker.com/linux/${apt_distro}/gpg" | sudo gpg --dearmor -o /etc/apt/keyrings/docker.gpg
    sudo chmod 644 /etc/apt/keyrings/docker.gpg
    # Add Docker repository
    echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/${apt_distro} $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
    sudo apt update
    $PKG_INSTALL docker-ce docker-ce-cli containerd.io docker-compose-plugin
  else
    # NOTE(review): on Amazon Linux (PKG_MGR=yum) `dnf config-manager` may be
    # unavailable — confirm on target hosts.
    $PKG_INSTALL dnf-plugins-core curl
    sudo dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
    $PKG_INSTALL docker-ce docker-ce-cli containerd.io docker-compose-plugin
  fi

  # Start and enable Docker, and let the invoking user run it without sudo
  # (group membership takes effect on next login).
  sudo systemctl enable --now docker
  sudo usermod -aG docker "$USER"
  log_success "Docker installed successfully"
}
# Create the config/data directory tree and hand data directories to the
# UIDs the containers actually run as.
create_directories() {
  log_info "[2/6] Creating monitoring directory structure..."

  mkdir -p "$INSTALL_DIR"/{prometheus/{data,rules},grafana/{data,provisioning/{datasources,dashboards}},alertmanager/data}

  # Set correct permissions
  chmod 755 "$INSTALL_DIR"
  chmod -R 755 "$INSTALL_DIR"/prometheus
  chmod -R 755 "$INSTALL_DIR"/grafana
  chmod -R 755 "$INSTALL_DIR"/alertmanager

  # Grafana runs as UID/GID 472 inside its container.
  sudo chown -R 472:472 "$INSTALL_DIR"/grafana/data
  # Prometheus runs as "nobody" (UID/GID 65534); without ownership of the
  # bind-mounted TSDB directory the container crash-loops on startup.
  sudo chown -R 65534:65534 "$INSTALL_DIR"/prometheus/data

  log_success "Directory structure created"
}
# Write prometheus.yml with properly indented YAML (the previous heredoc had
# flattened indentation, which Prometheus rejects). Also fixes the
# ai-model-metrics job: scrape_timeout must not exceed scrape_interval.
create_prometheus_config() {
  log_info "[3/6] Creating Prometheus configuration..."

  cat > "$INSTALL_DIR/prometheus/prometheus.yml" << 'EOF'
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'ai-model-monitor'

rule_files:
  - "rules/*.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  - job_name: 'ai-model-metrics'
    scrape_interval: 5s
    # scrape_timeout may not exceed this job's scrape_interval.
    scrape_timeout: 5s
    metrics_path: '/metrics'
    honor_labels: true
    static_configs:
      - targets: ['host.docker.internal:8000']

  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']

  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
EOF

  chmod 644 "$INSTALL_DIR/prometheus/prometheus.yml"
  log_success "Prometheus configuration created"
}
# Write the AI alerting rules with properly indented YAML (the previous
# heredoc had flattened indentation, which Prometheus rejects). This rule set
# omits the tutorial's GPU alert (no GPU exporter in this compose file) and
# adds a host CPU alert instead.
create_alerting_rules() {
  log_info "[4/6] Creating AI model alerting rules..."

  cat > "$INSTALL_DIR/prometheus/rules/ai_alerts.yml" << 'EOF'
groups:
  - name: ai_model_performance
    rules:
      - alert: HighInferenceLatency
        expr: histogram_quantile(0.95, rate(model_inference_duration_seconds_bucket[5m])) > 2.0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High inference latency detected"
          description: "95th percentile inference latency is {{ $value }}s for model {{ $labels.model_name }}"

      - alert: LowModelThroughput
        expr: rate(model_predictions_total[5m]) < 10
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Low model throughput"
          description: "Model {{ $labels.model_name }} throughput is {{ $value }} predictions/sec"

      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% on {{ $labels.instance }}"
EOF

  chmod 644 "$INSTALL_DIR/prometheus/rules/ai_alerts.yml"
  log_success "Alerting rules created"
}
# Provision the Prometheus datasource for Grafana with properly indented YAML
# (the previous heredoc had flattened indentation, which Grafana rejects).
create_grafana_config() {
  log_info "[5/6] Creating Grafana configuration..."

  cat > "$INSTALL_DIR/grafana/provisioning/datasources/prometheus.yml" << 'EOF'
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    # Service name on the compose network — not localhost.
    url: http://prometheus:9090
    isDefault: true
    editable: true
EOF

  chmod 644 "$INSTALL_DIR/grafana/provisioning/datasources/prometheus.yml"
  log_success "Grafana configuration created"
}
# Write docker-compose.yml with properly indented YAML (the previous heredoc
# had flattened indentation, which compose rejects). Images are pinned to the
# same versions as the tutorial instead of :latest for reproducible deploys.
create_docker_compose() {
  log_info "[6/6] Creating Docker Compose configuration..."

  cat > "$INSTALL_DIR/docker-compose.yml" << 'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:v2.47.0
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/rules:/etc/prometheus/rules:ro
      - ./prometheus/data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
    restart: unless-stopped

  grafana:
    image: grafana/grafana:10.1.0
    container_name: grafana
    ports:
      - "3000:3000"
    volumes:
      - ./grafana/data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning
    environment:
      # Change this default password before exposing Grafana.
      - GF_SECURITY_ADMIN_PASSWORD=admin123
      - GF_USERS_ALLOW_SIGN_UP=false
    restart: unless-stopped

  node-exporter:
    image: prom/node-exporter:v1.6.1
    container_name: node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    restart: unless-stopped

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.0
    container_name: cadvisor
    ports:
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    restart: unless-stopped

  alertmanager:
    image: prom/alertmanager:v0.26.0
    container_name: alertmanager
    ports:
      - "9093:9093"
    volumes:
      # NOTE: only data is mounted; Alertmanager runs with its image-default
      # configuration until an alertmanager.yml is mounted here as well.
      - ./alertmanager/data:/alertmanager
    restart: unless-stopped
EOF

  chmod 644 "$INSTALL_DIR/docker-compose.yml"
  log_success "Docker Compose configuration created"
}
# Bring the stack up. The previous implementation piped `docker compose` into
# `newgrp docker <<EOF`, which runs it in a new shell whose failures are not
# propagated under `set -e`. Instead, check whether the docker socket is
# already usable and fall back to sudo when the fresh group membership has
# not taken effect in this session yet.
start_services() {
  log_info "Starting monitoring stack..."
  cd "$INSTALL_DIR"

  if docker info >/dev/null 2>&1; then
    docker compose up -d
  else
    log_warning "Docker group membership not active in this session; using sudo."
    sudo docker compose up -d
  fi

  log_success "Monitoring stack started successfully"
}
# Verify containers are running and HTTP endpoints answer. Endpoints are
# polled for up to ~60s instead of relying on a single fixed sleep, which was
# flaky on first start while images are still being pulled.
verify_installation() {
  log_info "Verifying installation..."

  # Poll a URL until it answers or the retry budget is exhausted.
  probe() {
    local url=$1 tries=12
    while (( tries-- > 0 )); do
      if curl -sf "$url" > /dev/null 2>&1; then
        return 0
      fi
      sleep 5
    done
    return 1
  }

  local failed=0

  # Match whole container names: a plain `grep -q prometheus` would also
  # match unrelated containers whose names merely contain the string.
  if ! docker ps --format '{{.Names}}' | grep -qx prometheus; then
    log_error "Prometheus container is not running"
    failed=1
  fi
  if ! docker ps --format '{{.Names}}' | grep -qx grafana; then
    log_error "Grafana container is not running"
    failed=1
  fi

  if ! probe http://localhost:9090/-/healthy; then
    log_error "Prometheus is not accessible"
    failed=1
  fi
  if ! probe http://localhost:3000/api/health; then
    log_error "Grafana is not accessible"
    failed=1
  fi

  if [[ $failed -eq 0 ]]; then
    log_success "All services are running correctly!"
    echo ""
    echo -e "${GREEN}=== Installation Complete ===${NC}"
    echo -e "Prometheus: ${BLUE}http://localhost:9090${NC}"
    echo -e "Grafana: ${BLUE}http://localhost:3000${NC} (admin/admin123)"
    echo -e "Node Exporter: ${BLUE}http://localhost:9100${NC}"
    echo -e "cAdvisor: ${BLUE}http://localhost:8080${NC}"
    echo ""
    echo -e "${YELLOW}Note: You may need to log out and back in for Docker group membership to take effect.${NC}"
  else
    log_error "Some services failed to start. Check logs with: docker compose logs"
    exit 1
  fi
}
# Main execution: preflight checks, Docker install, config generation,
# service startup, then post-start verification — in that order.
main() {
  echo -e "${GREEN}AI Monitoring Stack Installer${NC}"
  echo "Installing to: $INSTALL_DIR"
  echo ""
  check_root
  detect_distro
  install_docker
  create_directories
  create_prometheus_config
  create_alerting_rules
  create_grafana_config
  create_docker_compose
  start_services
  verify_installation
}
main "$@"
Review the script before running. Execute with: bash install.sh