Set up Grafana Enterprise high availability clustering with PostgreSQL backend and load balancing

Advanced 90 min Apr 20, 2026 19 views
Ubuntu 24.04 Debian 12 AlmaLinux 9 Rocky Linux 9

Build a production-ready Grafana Enterprise cluster with PostgreSQL shared storage, HAProxy load balancing, and SSL encryption. Includes automated failover, session persistence, and comprehensive monitoring for enterprise observability platforms.

Prerequisites

  • 3+ servers for Grafana instances
  • 2+ servers for PostgreSQL cluster
  • 1 server for HAProxy load balancer
  • Valid Grafana Enterprise license
  • SSL certificates for domain

What this solves

Grafana Enterprise high availability clustering eliminates single points of failure for your monitoring infrastructure. This setup provides automated failover, horizontal scaling, and persistent sessions across multiple Grafana instances using a shared PostgreSQL database and HAProxy load balancer.

Step-by-step configuration

Update system packages

Start by updating all servers in your cluster to ensure consistent package versions.

sudo apt update && sudo apt upgrade -y
sudo dnf update -y

Install PostgreSQL cluster

Set up a PostgreSQL cluster for shared Grafana data. Install on your designated database servers.

sudo apt install -y postgresql postgresql-contrib postgresql-client
sudo systemctl enable --now postgresql
sudo dnf install -y postgresql-server postgresql-contrib postgresql
sudo postgresql-setup --initdb
sudo systemctl enable --now postgresql

Configure PostgreSQL for clustering

Enable streaming replication and configure authentication for the Grafana database cluster.

# Primary server configuration
listen_addresses = '*'
wal_level = replica
max_wal_senders = 3
max_replication_slots = 3
archive_mode = on
archive_command = 'cp %p /var/lib/postgresql/16/main/archive/%f'
log_line_prefix = '%t [%p]: [%l-1] user=%u,db=%d,app=%a,client=%h '
log_checkpoints = on
log_connections = on
log_disconnections = on

Configure PostgreSQL authentication

Set up authentication rules for Grafana connections and replication users.

# Grafana database connections
host    grafana         grafana_user    203.0.113.10/32   md5
host    grafana         grafana_user    203.0.113.11/32   md5
host    grafana         grafana_user    203.0.113.12/32   md5

# Replication connections
host    replication     replicator      203.0.113.20/32   md5
host    replication     replicator      203.0.113.21/32   md5

Create Grafana database and user

Set up the dedicated database and user account for Grafana Enterprise with proper privileges.

sudo -u postgres psql
CREATE DATABASE grafana;
CREATE USER grafana_user WITH ENCRYPTED PASSWORD 'secure_grafana_password_2024';
GRANT ALL PRIVILEGES ON DATABASE grafana TO grafana_user;
ALTER USER grafana_user CREATEDB;
\q

Set up PostgreSQL streaming replication

Configure a standby server for automatic failover. Run this on your secondary PostgreSQL server.

# On primary server - create replication user
sudo -u postgres psql
CREATE USER replicator WITH REPLICATION ENCRYPTED PASSWORD 'replication_password_2024';
\q

# On standby server - create base backup
sudo systemctl stop postgresql
sudo rm -rf /var/lib/postgresql/16/main/*
sudo -u postgres pg_basebackup -h 203.0.113.20 -D /var/lib/postgresql/16/main -U replicator -W -v -P
sudo -u postgres touch /var/lib/postgresql/16/main/standby.signal

Configure standby server

Set up recovery configuration for the PostgreSQL standby instance.

primary_conninfo = 'host=203.0.113.20 port=5432 user=replicator password=replication_password_2024'
restore_command = 'cp /var/lib/postgresql/16/main/archive/%f %p'
recovery_target_timeline = 'latest'

Install Grafana Enterprise

Install Grafana Enterprise on your application servers. You'll need a valid license key.

wget -q -O /usr/share/keyrings/grafana.key https://apt.grafana.com/gpg.key
echo "deb [signed-by=/usr/share/keyrings/grafana.key] https://apt.grafana.com stable main" | sudo tee -a /etc/apt/sources.list.d/grafana.list
sudo apt update
sudo apt install -y grafana-enterprise
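sudo tee /etc/yum.repos.d/grafana.repo <<'EOF'
[grafana]
name=grafana
baseurl=https://rpm.grafana.com
repo_gpgcheck=1
enabled=1
gpgcheck=1
gpgkey=https://rpm.grafana.com/gpg.key
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
EOF
sudo dnf install -y grafana-enterprise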

Configure Grafana Enterprise for clustering

Set up each Grafana instance with PostgreSQL backend and clustering parameters. Use this config on all Grafana servers.

[server]
http_port = 3000
domain = grafana.example.com
root_url = https://grafana.example.com
serve_from_sub_path = false

[database]
type = postgres
host = 203.0.113.20:5432
name = grafana
user = grafana_user
password = secure_grafana_password_2024
ssl_mode = require
max_open_conn = 300
max_idle_conn = 300
conn_max_lifetime = 14400

[session]
provider = postgres
provider_config = user=grafana_user password=secure_grafana_password_2024 host=203.0.113.20 port=5432 dbname=grafana sslmode=require
cookie_secure = true
session_life_time = 86400

[security]
admin_user = admin
admin_password = secure_admin_password_2024
secret_key = very_long_random_secret_key_for_clustering_2024
cookie_secure = true
cookie_samesite = strict
strict_transport_security = true

[auth.anonymous]
enabled = false

[log]
mode = file
level = info
format = json

[alerting]
enabled = true
execute_alerts = true

[unified_alerting]
enabled = true

[enterprise]
license_path = /etc/grafana/license.jwt

Install and configure HAProxy

Set up HAProxy for load balancing across your Grafana Enterprise instances.

sudo apt install -y haproxy
sudo dnf install -y haproxy

Configure HAProxy load balancing

Set up sticky sessions and health checks for your Grafana Enterprise cluster.

global
    chroot /var/lib/haproxy
    stats socket /run/haproxy/admin.sock mode 660 level admin
    stats timeout 30s
    user haproxy
    group haproxy
    daemon
    log stdout local0
    ssl-default-bind-ciphers ECDHE+AESGCM:ECDHE+CHACHA20:DHE+AESGCM:DHE+CHACHA20:!aNULL:!SHA1:!AESCCM
    ssl-default-bind-options no-sslv3 no-tlsv10 no-tlsv11
    tune.ssl.default-dh-param 2048

defaults
    mode http
    log global
    option httplog
    option dontlognull
    option log-health-checks
    option forwardfor
    option http-server-close
    timeout connect 10s
    timeout client 300s
    timeout server 300s
    timeout http-keep-alive 10s
    timeout check 10s
    maxconn 3000
    default-server inter 10s downinter 5s rise 2 fall 2 slowstart 60s maxconn 250 maxqueue 256 weight 100

frontend grafana_frontend
    bind *:80
    bind *:443 ssl crt /etc/ssl/certs/grafana.example.com.pem
    redirect scheme https if !{ ssl_fc }
    
    # Security headers
    http-response set-header Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
    http-response set-header X-Frame-Options "DENY"
    http-response set-header X-Content-Type-Options "nosniff"
    http-response set-header X-XSS-Protection "1; mode=block"
    
    default_backend grafana_backend

backend grafana_backend
    balance roundrobin
    cookie GRAFANA_SESSION_ID prefix nocache
    
    # Health check
    option httpchk GET /api/health
    http-check expect status 200
    
    server grafana1 203.0.113.10:3000 check cookie grafana1 ssl verify none
    server grafana2 203.0.113.11:3000 check cookie grafana2 ssl verify none
    server grafana3 203.0.113.12:3000 check cookie grafana3 ssl verify none

listen stats
    bind *:8404
    stats enable
    stats uri /stats
    stats refresh 30s
    stats admin if TRUE

Generate SSL certificates

Create SSL certificates for encrypted communication. For production, use Let's Encrypt or your CA.

# Generate self-signed certificate for testing
sudo mkdir -p /etc/ssl/certs
sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 \
    -keyout /etc/ssl/certs/grafana.example.com.key \
    -out /etc/ssl/certs/grafana.example.com.crt \
    -subj "/C=US/ST=State/L=City/O=Organization/CN=grafana.example.com"

# Combine for HAProxy
sudo cat /etc/ssl/certs/grafana.example.com.crt /etc/ssl/certs/grafana.example.com.key > /etc/ssl/certs/grafana.example.com.pem
sudo chmod 600 /etc/ssl/certs/grafana.example.com.pem

Configure firewall rules

Open required ports for cluster communication and monitoring access.

# HAProxy
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp
sudo ufw allow 8404/tcp

# PostgreSQL
sudo ufw allow from 203.0.113.0/24 to any port 5432

# Grafana inter-node
sudo ufw allow from 203.0.113.0/24 to any port 3000
# HAProxy
sudo firewall-cmd --permanent --add-port=80/tcp
sudo firewall-cmd --permanent --add-port=443/tcp
sudo firewall-cmd --permanent --add-port=8404/tcp

# PostgreSQL
sudo firewall-cmd --permanent --add-rich-rule="rule family=ipv4 source address=203.0.113.0/24 port protocol=tcp port=5432 accept"

# Grafana inter-node
sudo firewall-cmd --permanent --add-rich-rule="rule family=ipv4 source address=203.0.113.0/24 port protocol=tcp port=3000 accept"
sudo firewall-cmd --reload

Initialize database schema

Run database migrations on one Grafana instance to set up the schema.

# On first Grafana server only
sudo systemctl start grafana-server
sudo systemctl enable grafana-server

# Check logs to ensure successful startup

sudo journalctl -u grafana-server -f

Start all cluster services

Enable and start all services in the correct order across your cluster.

# PostgreSQL servers
sudo systemctl restart postgresql
sudo systemctl status postgresql

# All Grafana servers
sudo systemctl enable --now grafana-server
sudo systemctl status grafana-server

# HAProxy server
sudo systemctl enable --now haproxy
sudo systemctl status haproxy

Install Grafana Enterprise license

Upload your Enterprise license to enable clustering features on all nodes.

# Copy license file to each Grafana server
sudo cp /path/to/license.jwt /etc/grafana/license.jwt
sudo chown grafana:grafana /etc/grafana/license.jwt
sudo chmod 644 /etc/grafana/license.jwt

# Restart to load license

sudo systemctl restart grafana-server

Configure monitoring and alerting

Enable HAProxy statistics

Configure monitoring for your load balancer to track cluster health and performance.

# Access HAProxy stats at http://your-haproxy-server:8404/stats
curl -s http://203.0.113.100:8404/stats | head -10

Set up database monitoring

Monitor PostgreSQL replication status and connection health.

# Check replication status on primary
sudo -u postgres psql -c "SELECT client_addr, state, sync_state FROM pg_stat_replication;"

# Check replication status on standby

sudo -u postgres psql -c "SELECT pg_is_in_recovery(), pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn();"

Configure log aggregation

Set up centralized logging for troubleshooting cluster issues. This integrates with existing log management systems.

# HAProxy logs
$ModLoad imudp
$UDPServerRun 514
$UDPServerAddress 127.0.0.1
local0.*    /var/log/haproxy.log
& stop

# PostgreSQL logs

local0.* /var/log/postgresql/cluster.log

Verify your setup

# Check cluster status
curl -k https://grafana.example.com/api/health
curl -s http://203.0.113.100:8404/stats | grep -E "grafana[1-3]"

# Test failover
sudo systemctl stop grafana-server   # on one node
curl -k https://grafana.example.com/api/health   # should still work

# Check database connectivity

sudo -u postgres psql -h 203.0.113.20 -d grafana -c "SELECT count(*) FROM dashboard;"

# Verify session persistence
curl -c cookies.txt -k https://grafana.example.com/login
curl -b cookies.txt -k https://grafana.example.com/api/user

Common issues

| Symptom | Cause | Fix |
| --- | --- | --- |
| 502 Bad Gateway from HAProxy | Grafana instances down or unreachable | Check `sudo systemctl status grafana-server` and firewall rules |
| Database connection errors | PostgreSQL authentication or network issues | Verify pg_hba.conf rules and test with `psql -h host -U user -d grafana` |
| Session not persisting across nodes | Database session storage not configured | Check session provider config in grafana.ini and database connectivity |
| License activation failures | Invalid license file or permissions | Verify license file permissions and check `journalctl -u grafana-server` |
| SSL certificate errors | Certificate mismatch or expired | Check certificate validity with `openssl x509 -in cert.pem -text -noout` |
| PostgreSQL replication lag | Network latency or high write load | Monitor with `SELECT * FROM pg_stat_replication;` and optimize WAL settings |

Security hardening

Enable database SSL encryption

Configure PostgreSQL SSL for encrypted communication between Grafana and database.

# Generate SSL certificates for PostgreSQL
sudo -u postgres openssl req -new -x509 -days 365 -nodes -text -out /var/lib/postgresql/server.crt -keyout /var/lib/postgresql/server.key -subj "/CN=postgres.example.com"
sudo chmod 600 /var/lib/postgresql/server.key
sudo chmod 644 /var/lib/postgresql/server.crt

Configure authentication security

Implement additional security measures for production deployments.

[auth]
disable_login_form = false
disable_signout_menu = false
signout_redirect_url = https://grafana.example.com/login
oauth_auto_login = false

[auth.basic]
enabled = true

[security]
disable_gravatar = true
cookie_samesite = strict
allow_embedding = false
angular_support_enabled = false

[users]
allow_sign_up = false
allow_org_create = false
auto_assign_org = true
auto_assign_org_role = Viewer

Performance optimization

Optimize PostgreSQL for Grafana workload

Tune database parameters for optimal Grafana Enterprise performance.

# Memory settings
shared_buffers = '512MB'
effective_cache_size = '1GB'
work_mem = '16MB'
maintenance_work_mem = '128MB'

# Connection settings
max_connections = 300
idle_in_transaction_session_timeout = 300000

# Performance settings
random_page_cost = 1.1
effective_io_concurrency = 200
max_worker_processes = 8
max_parallel_workers_per_gather = 4
max_parallel_workers = 8

Configure HAProxy performance tuning

Optimize load balancer settings for high-traffic Grafana deployments.

# Add to global section
nbproc 1
nbthread 4
cpu-map auto:1/1-4 0-3
ssl-server-verify none

# Add to defaults section
timeout http-request 10s
timeout queue 1m
timeout tarpit 10s
compression algo gzip
compression type text/html text/plain text/css text/javascript application/javascript application/json

Next steps

Running this in production?

Want this handled for you? Running this at scale adds a second layer of work: capacity planning, failover drills, cost control, and on-call. Our managed platform covers monitoring, backups and 24/7 response by default.

Automated install script

Run this to automate the entire setup

Need help?

Don't want to manage this yourself?

We handle managed devops services for businesses that depend on uptime. From initial setup to ongoing operations.