Build a production-grade Zabbix 7 high availability cluster with PostgreSQL streaming replication, HAProxy load balancing, and automated failover capabilities for enterprise monitoring.
Prerequisites
- At least 6 servers (2 database, 3 Zabbix, 1 HAProxy)
- Root or sudo access on all servers
- Network connectivity between all nodes
- Basic understanding of PostgreSQL and load balancing
What this solves
This tutorial helps you build a production-grade Zabbix 7 high availability cluster that eliminates single points of failure in your monitoring infrastructure. You'll configure PostgreSQL streaming replication for database redundancy, set up multiple Zabbix servers with automated failover, and implement HAProxy load balancing for the web frontend. This setup ensures your monitoring remains operational even during server failures, database outages, or maintenance windows.
Step-by-step configuration
Install PostgreSQL on database servers
Install PostgreSQL 15 on two dedicated database servers that will serve as primary and replica nodes.
sudo apt update
sudo apt install -y postgresql-15 postgresql-contrib-15
sudo systemctl enable --now postgresql
Configure PostgreSQL primary server
Configure the primary database server to enable streaming replication and create the Zabbix database.
listen_addresses = '*'
wal_level = replica
max_wal_senders = 3
max_replication_slots = 3
archive_mode = on
archive_command = 'cp %p /var/lib/postgresql/15/main/archive/%f'
hot_standby = on
shared_preload_libraries = 'pg_stat_statements'
max_connections = 200
# Replication connections
host replication zabbix_repl 203.0.113.11/32 md5
host replication zabbix_repl 203.0.113.12/32 md5
# Zabbix server connections
host zabbix zabbix 203.0.113.21/32 md5
host zabbix zabbix 203.0.113.22/32 md5
host zabbix zabbix 203.0.113.23/32 md5
sudo mkdir -p /var/lib/postgresql/15/main/archive
sudo chown postgres:postgres /var/lib/postgresql/15/main/archive
sudo systemctl restart postgresql
Create Zabbix database and users
Create the Zabbix database, replication user, and application user on the primary PostgreSQL server.
sudo -u postgres psql -c "CREATE USER zabbix_repl REPLICATION LOGIN PASSWORD 'repl_password_here';"
sudo -u postgres psql -c "CREATE USER zabbix PASSWORD 'zabbix_password_here';"
sudo -u postgres psql -c "CREATE DATABASE zabbix OWNER zabbix ENCODING 'UTF8' LC_COLLATE 'en_US.UTF-8' LC_CTYPE 'en_US.UTF-8' TEMPLATE template0;"
sudo -u postgres psql -c "GRANT ALL PRIVILEGES ON DATABASE zabbix TO zabbix;"
Set up PostgreSQL replica server
Configure the replica database server to stream changes from the primary server.
sudo systemctl stop postgresql
sudo -u postgres rm -rf /var/lib/postgresql/15/main/*
sudo -u postgres pg_basebackup -h 203.0.113.10 -D /var/lib/postgresql/15/main -U zabbix_repl -v -P -W -R
hot_standby = on
max_connections = 200
sudo systemctl start postgresql
sudo -u postgres psql -c "SELECT pg_is_in_recovery();"
Install Zabbix server packages
Install Zabbix 7.0 server packages on three application servers for high availability clustering.
wget https://repo.zabbix.com/zabbix/7.0/ubuntu/pool/main/z/zabbix-release/zabbix-release_7.0-2+ubuntu24.04_all.deb
sudo dpkg -i zabbix-release_7.0-2+ubuntu24.04_all.deb
sudo apt update
sudo apt install -y zabbix-server-pgsql zabbix-frontend-php zabbix-apache-conf zabbix-sql-scripts zabbix-agent2
Initialize Zabbix database schema
Import the initial Zabbix database schema only on the primary PostgreSQL server.
sudo zcat /usr/share/zabbix-sql-scripts/postgresql/server.sql.gz | sudo -u zabbix psql -h 203.0.113.10 zabbix
Configure Zabbix server for high availability
Configure each Zabbix server node with HA settings and database connection details.
LogFile=/var/log/zabbix/zabbix_server.log
LogFileSize=0
PidFile=/run/zabbix/zabbix_server.pid
SocketDir=/run/zabbix
DBHost=203.0.113.10
DBName=zabbix
DBUser=zabbix
DBPassword=zabbix_password_here
DBPort=5432
StartPollers=10
StartPollersUnreachable=2
StartTrappers=5
StartPingers=2
StartDiscoverers=2
StartHTTPPollers=2
StartTimers=2
StartEscalators=2
StartAlerters=3
HousekeepingFrequency=1
MaxHousekeeperDelete=50000
CacheSize=32M
CacheUpdateFrequency=60
StartDBSyncers=4
HistoryCacheSize=64M
HistoryIndexCacheSize=16M
TrendCacheSize=16M
ValueCacheSize=32M
Timeout=4
TrapperTimeout=300
UnreachablePeriod=45
UnavailableDelay=60
UnreachableDelay=15
LogLevel=3
HANodeName=zabbix-node-01
NodeAddress=203.0.113.21:10051
Configure second Zabbix server node
Configure the second Zabbix server with unique HA node settings.
# Same configuration as node 1, but change:
HANodeName=zabbix-node-02
NodeAddress=203.0.113.22:10051
Configure third Zabbix server node
Configure the third Zabbix server to complete the HA cluster setup.
# Same configuration as node 1, but change:
HANodeName=zabbix-node-03
NodeAddress=203.0.113.23:10051
Install and configure HAProxy
Set up HAProxy on a dedicated server to load balance Zabbix frontend requests.
sudo apt install -y haproxy
global
log 127.0.0.1:514 local0
chroot /var/lib/haproxy
stats socket /run/haproxy/admin.sock mode 660 level admin
stats timeout 30s
user haproxy
group haproxy
daemon
defaults
mode http
log global
option httplog
option dontlognull
option log-health-checks
option forwardfor
option http-server-close
timeout connect 10s
timeout client 1m
timeout server 1m
timeout http-keep-alive 10s
timeout check 10s
maxconn 3000
frontend zabbix_frontend
bind *:80
bind *:443 ssl crt /etc/ssl/certs/zabbix.pem
redirect scheme https if !{ ssl_fc }
default_backend zabbix_servers
backend zabbix_servers
balance roundrobin
option httpchk GET /zabbix/api_jsonrpc.php
http-check expect status 200
server zabbix1 203.0.113.21:80 check inter 5000 rise 2 fall 3
server zabbix2 203.0.113.22:80 check inter 5000 rise 2 fall 3
server zabbix3 203.0.113.23:80 check inter 5000 rise 2 fall 3
listen stats
bind *:8404
stats enable
stats uri /stats
stats refresh 30s
stats admin if TRUE
Configure database failover script
Create a script to promote the PostgreSQL replica to primary during failover scenarios.
#!/bin/bash
# PostgreSQL failover check: probes the primary and, when the probe fails,
# promotes the replica and repoints the Zabbix servers at it.
# Host that check_primary probes with pg_isready.
PRIMARY_HOST="203.0.113.10"
# Host that promote_replica promotes over SSH.
REPLICA_HOST="203.0.113.11"
# Zabbix server nodes whose DBHost setting promote_replica rewrites.
ZABBIX_SERVERS=("203.0.113.21" "203.0.113.22" "203.0.113.23")
# File that log_message appends to.
LOGFILE="/var/log/postgres-failover.log"
# Append "<current date>: <message>" to $LOGFILE.
# Arguments: $1 - message text
log_message() {
  printf '%s: %s\n' "$(date)" "$1" >>"$LOGFILE"
}
# Return pg_isready's exit status for $PRIMARY_HOST on port 5432 (user
# postgres); everything pg_isready prints is discarded.
check_primary() {
  local -a probe=(pg_isready -h "$PRIMARY_HOST" -p 5432 -U postgres)
  "${probe[@]}" &>/dev/null
}
# Promote the replica and repoint the Zabbix servers at it.
#
# Safe to call repeatedly: the script is re-run while the old primary stays
# down, so the replica is only promoted while it is still in recovery, and a
# Zabbix node is only rewritten and restarted while its config still names
# the old primary.
#
# Globals:  PRIMARY_HOST, REPLICA_HOST, ZABBIX_SERVERS (read)
# Returns:  0 when every required step succeeded, 1 otherwise
promote_replica() {
  local conf="/etc/zabbix/zabbix_server.conf"
  # Escape the dots so grep/sed match the address literally.
  local primary_re="${PRIMARY_HOST//./\\.}"
  local in_recovery promoted server probe_rc
  local rc=0 acted=0

  if ! in_recovery=$(ssh postgres@"$REPLICA_HOST" "psql -Atc 'SELECT pg_is_in_recovery()'"); then
    log_message "Failover aborted: cannot query replica $REPLICA_HOST"
    return 1
  fi

  if [[ "$in_recovery" == "t" ]]; then
    log_message "Promoting replica to primary"
    # pg_promote() (PostgreSQL 12+) is used instead of pg_ctl so the call
    # does not depend on pg_ctl being on the remote user's PATH.
    promoted=$(ssh postgres@"$REPLICA_HOST" "psql -Atc 'SELECT pg_promote()'") || promoted=""
    if [[ "$promoted" != "t" ]]; then
      log_message "Failover aborted: promotion of $REPLICA_HOST failed"
      return 1
    fi
    acted=1
  fi

  # Update Zabbix server configurations
  for server in "${ZABBIX_SERVERS[@]}"; do
    probe_rc=0
    ssh root@"$server" "grep -Eq '^DBHost=${primary_re}[[:space:]]*\$' ${conf}" || probe_rc=$?
    case "$probe_rc" in
      0)
        if ssh root@"$server" "sed -i -E 's/^DBHost=${primary_re}[[:space:]]*\$/DBHost=${REPLICA_HOST}/' ${conf} && systemctl restart zabbix-server"; then
          log_message "Zabbix server $server now uses database $REPLICA_HOST"
          acted=1
        else
          log_message "Failed to repoint Zabbix server $server"
          rc=1
        fi
        ;;
      1)
        # grep found no match: this node no longer names the old primary.
        ;;
      *)
        log_message "Cannot inspect ${conf} on Zabbix server $server"
        rc=1
        ;;
    esac
  done

  if (( rc != 0 )); then
    log_message "Failover completed with errors"
  elif (( acted )); then
    log_message "Failover completed successfully"
  fi
  return "$rc"
}
# Probe the primary once; start the failover only when the probe fails.
if check_primary; then
  log_message "Primary PostgreSQL server is healthy"
else
  log_message "Primary PostgreSQL server is down, initiating failover"
  promote_replica
fi
sudo chmod +x /usr/local/bin/postgres-failover.sh
sudo chown root:root /usr/local/bin/postgres-failover.sh
Set up automated health checking
Configure systemd timer for regular health checks and automated failover.
[Unit]
Description=PostgreSQL Health Check and Failover
After=network.target
[Service]
Type=oneshot
User=root
ExecStart=/usr/local/bin/postgres-failover.sh
StandardOutput=journal
StandardError=journal
[Unit]
Description=Run PostgreSQL health check every 30 seconds
Requires=postgres-health-check.service
[Timer]
OnBootSec=30
OnUnitActiveSec=30
[Install]
WantedBy=timers.target
sudo systemctl daemon-reload
sudo systemctl enable --now postgres-health-check.timer
Start all services
Enable and start all Zabbix servers and HAProxy to complete the cluster setup.
# On each Zabbix server
sudo systemctl enable --now zabbix-server zabbix-agent2 apache2
# On HAProxy server
sudo systemctl enable --now haproxy
Configure firewall rules
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp
sudo ufw allow 10051/tcp
sudo ufw allow 5432/tcp
Configure Zabbix web frontend
Complete the initial Zabbix web setup through the browser interface.
Verify your setup
# Check PostgreSQL replication status
sudo -u postgres psql -c "SELECT * FROM pg_stat_replication;"
# Verify Zabbix server HA nodes
sudo -u postgres psql zabbix -c "SELECT * FROM ha_node;"
# Check HAProxy stats
curl http://203.0.113.30:8404/stats
# Test database connection from Zabbix servers
sudo -u zabbix psql -h 203.0.113.10 -d zabbix -c "SELECT count(*) FROM users;"
# Verify all services are running
sudo systemctl status zabbix-server zabbix-agent2 postgresql haproxy
Test automated failover
Simulate database failure
Test the automated failover by stopping the primary PostgreSQL server.
# On primary database server
sudo systemctl stop postgresql
# Monitor failover logs
tail -f /var/log/postgres-failover.log
# Check that replica is now primary
sudo -u postgres psql -h 203.0.113.11 -c "SELECT pg_is_in_recovery();"
# Verify Zabbix servers switched to new database
grep DBHost /etc/zabbix/zabbix_server.conf
Test Zabbix server failover
Verify HAProxy automatically routes traffic when Zabbix servers fail.
# Stop one Zabbix server
sudo systemctl stop zabbix-server
# Check HAProxy removes failed server from pool
echo "show stat" | socat stdio /run/haproxy/admin.sock
# Verify web interface remains accessible
curl -I https://203.0.113.30/zabbix/
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Replication not working | Incorrect pg_hba.conf settings | Verify replication user permissions and network access rules |
| Zabbix HA nodes not visible | NodeAddress misconfiguration | Check NodeAddress matches actual server IP in zabbix_server.conf |
| Database connection fails | Wrong database credentials | Verify DBUser and DBPassword in Zabbix configuration match PostgreSQL user |
| HAProxy health checks fail | Apache not responding | Ensure Apache is running and Zabbix frontend is accessible |
| Failover script not executing | SSH key authentication missing | Set up passwordless SSH keys between monitoring and target servers |
| Frontend shows database error | Database schema not imported | Import server.sql.gz into zabbix database on primary PostgreSQL |
Next steps
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
# Role-based installer for one node of the Zabbix HA cluster; the role is
# selected with --role (see usage).
# Strict mode: stop on the first failing command (-e), on any unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Colors for output
# ANSI escape sequences; they are expanded when log/error/warn print them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
# Resets the terminal color after a colored prefix.
NC='\033[0m'
# Globals
# Base name this script was invoked under; shown in the usage text.
SCRIPT_NAME=$(basename "$0")
# File that log/error/warn append every message to.
LOG_FILE="/var/log/zabbix-ha-install.log"
# Usage function
# Print the option summary and example invocations on stdout, then exit
# with status 1.
usage() {
  printf '%s\n' \
    "Usage: $SCRIPT_NAME [OPTIONS]" \
    "Options:" \
    " --role ROLE Node role: db-primary, db-replica, zabbix-server, haproxy" \
    " --primary-db-ip IP Primary database IP address" \
    " --replica-db-ip IP Replica database IP address (for replication)" \
    " --zabbix-ips RANGE Comma-separated Zabbix server IPs" \
    " --node-id ID Zabbix node ID (1-3 for servers)" \
    " --help Show this help message" \
    "" \
    "Examples:" \
    " $SCRIPT_NAME --role db-primary --replica-db-ip 10.0.1.11" \
    " $SCRIPT_NAME --role db-replica --primary-db-ip 10.0.1.10" \
    " $SCRIPT_NAME --role zabbix-server --primary-db-ip 10.0.1.10 --node-id 1"
  exit 1
}
# Logging function
# Print "<GREEN>[timestamp]<NC> <message>" on stdout and append the same line
# to $LOG_FILE. Backslash escapes in the line are expanded.
# Arguments: $1 - message text
log() {
  printf '%b\n' "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')]${NC} $1" \
    | tee -a "$LOG_FILE"
}
# Report a fatal error and terminate the script.
# Arguments: $1 - message text (backslash escapes are expanded)
# Outputs:   "<RED>[ERROR]<NC> <message>" on stderr, also appended to $LOG_FILE
# Returns:   never; always exits with status 1
error() {
  # Diagnostics belong on stderr so they never mix into captured stdout.
  # `|| true`: an unwritable log file must not stop the exit below.
  printf '%b\n' "${RED}[ERROR]${NC} ${1:-}" | tee -a "$LOG_FILE" >&2 || true
  exit 1
}
# Report a non-fatal warning.
# Arguments: $1 - message text (backslash escapes are expanded)
# Outputs:   "<YELLOW>[WARNING]<NC> <message>" on stderr, also appended to
#            $LOG_FILE
warn() {
  # Diagnostics belong on stderr so they never mix into captured stdout.
  printf '%b\n' "${YELLOW}[WARNING]${NC} ${1:-}" | tee -a "$LOG_FILE" >&2
}
# Cleanup function
# ERR-trap handler: when the trapped command failed, report it through
# error(), which terminates the script.
cleanup() {
  local status=$?
  if [ "$status" -ne 0 ]; then
    error "Installation failed. Check $LOG_FILE for details."
  fi
}
# errtrace: without it bash does not run the ERR trap for commands that fail
# inside shell functions, which is where this script does its work.
set -o errtrace
trap cleanup ERR
# Check prerequisites
# Abort through error() unless the effective UID is 0 and a `curl` command
# can be found.
check_prerequisites() {
  [[ "$EUID" -eq 0 ]] || error "This script must be run as root"
  command -v curl >/dev/null 2>&1 || error "curl is required but not installed"
}
# Detect distribution
# Source an os-release file and derive the package-manager, PostgreSQL and
# firewall settings from its ID field.
# Arguments: $1 - os-release file to read (optional; default /etc/os-release)
# Globals:   sets ID and PRETTY_NAME (plus anything else the file defines),
#            PKG_MGR, PKG_INSTALL, PKG_UPDATE, PG_VERSION, PG_DATA_DIR,
#            PG_CONFIG_DIR, PG_SERVICE, FIREWALL_CMD
# Returns:   0 on success; calls error() when the file is missing or the
#            distribution ID is not handled
detect_distro() {
  local os_release="${1:-/etc/os-release}"

  if [ ! -f "$os_release" ]; then
    error "Cannot detect distribution"
  fi

  # Start from empty values so a stale environment cannot leak in, then give
  # the optional fields a value so later reads are safe under `set -u`.
  ID=""
  PRETTY_NAME=""
  # shellcheck disable=SC1090
  . "$os_release"
  ID="${ID:-linux}"
  PRETTY_NAME="${PRETTY_NAME:-$ID}"

  PG_VERSION="15"
  case "$ID" in
    ubuntu|debian)
      PKG_MGR="apt"
      PKG_INSTALL="apt install -y"
      PKG_UPDATE="apt update"
      PG_DATA_DIR="/var/lib/postgresql/15/main"
      PG_CONFIG_DIR="/etc/postgresql/15/main"
      PG_SERVICE="postgresql"
      FIREWALL_CMD="ufw"
      ;;
    almalinux|rocky|centos|rhel|ol)
      PKG_MGR="dnf"
      PKG_INSTALL="dnf install -y"
      PKG_UPDATE="dnf update -y"
      PG_DATA_DIR="/var/lib/pgsql/15/data"
      PG_CONFIG_DIR="/var/lib/pgsql/15/data"
      PG_SERVICE="postgresql-15"
      FIREWALL_CMD="firewall-cmd"
      ;;
    fedora)
      PKG_MGR="dnf"
      PKG_INSTALL="dnf install -y"
      PKG_UPDATE="dnf update -y"
      PG_DATA_DIR="/var/lib/pgsql/data"
      PG_CONFIG_DIR="/var/lib/pgsql/data"
      PG_SERVICE="postgresql"
      FIREWALL_CMD="firewall-cmd"
      ;;
    amzn)
      PKG_MGR="yum"
      PKG_INSTALL="yum install -y"
      PKG_UPDATE="yum update -y"
      PG_DATA_DIR="/var/lib/pgsql/15/data"
      PG_CONFIG_DIR="/var/lib/pgsql/15/data"
      PG_SERVICE="postgresql-15"
      FIREWALL_CMD="firewall-cmd"
      ;;
    *)
      error "Unsupported distribution: $ID"
      ;;
  esac

  log "Detected distribution: $PRETTY_NAME"
}
# Parse arguments
# Parse the command line into the globals the role functions read, and
# reject invocations that lack the values a role needs.
# Arguments: "$@" - the script's command-line arguments
# Globals:   sets ROLE, PRIMARY_DB_IP, REPLICA_DB_IP, ZABBIX_IPS, NODE_ID
# Returns:   0 on success; calls error() on an unknown option, an option
#            without a value, a missing role, or a missing role-specific
#            value; calls usage() for --help
parse_args() {
  ROLE=""
  PRIMARY_DB_IP=""
  REPLICA_DB_IP=""
  ZABBIX_IPS=""
  NODE_ID=""

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --role|--primary-db-ip|--replica-db-ip|--zabbix-ips|--node-id)
        # Checked here so a missing value gives a clear message instead of
        # an "unbound variable" abort on $2 under `set -u`.
        if [[ $# -lt 2 || "${2:-}" == --* ]]; then
          error "Option $1 requires a value"
        fi
        case "$1" in
          --role) ROLE="$2" ;;
          --primary-db-ip) PRIMARY_DB_IP="$2" ;;
          --replica-db-ip) REPLICA_DB_IP="$2" ;;
          --zabbix-ips) ZABBIX_IPS="$2" ;;
          --node-id) NODE_ID="$2" ;;
        esac
        shift 2
        ;;
      --help)
        usage
        ;;
      *)
        error "Unknown option: $1"
        ;;
    esac
  done

  if [ -z "$ROLE" ]; then
    error "Role is required. Use --help for usage information."
  fi

  # Values each role reads later; unknown roles are left for the caller.
  case "$ROLE" in
    db-primary)
      if [ -z "$REPLICA_DB_IP" ]; then
        error "--replica-db-ip is required for role db-primary"
      fi
      ;;
    db-replica)
      if [ -z "$PRIMARY_DB_IP" ]; then
        error "--primary-db-ip is required for role db-replica"
      fi
      ;;
    zabbix-server)
      if [ -z "$PRIMARY_DB_IP" ]; then
        error "--primary-db-ip is required for role zabbix-server"
      fi
      if [ -z "$NODE_ID" ]; then
        error "--node-id is required for role zabbix-server"
      fi
      ;;
  esac
}
# Install PostgreSQL
# Install the PostgreSQL server packages for the detected package manager,
# initialise the cluster on RPM-based systems when needed, and enable and
# start the service.
# Globals: PKG_MGR, PKG_UPDATE, PKG_INSTALL, PG_DATA_DIR (read)
install_postgresql() {
  log "[1/8] Installing PostgreSQL"
  case "$PKG_MGR" in
    apt)
      # NOTE(review): postgresql-15 may not be in the distribution's own
      # repositories (Ubuntu 24.04 ships 16) - confirm a PGDG apt source is
      # configured where needed.
      # PKG_UPDATE/PKG_INSTALL hold "command args" strings and are expanded
      # unquoted on purpose so they split into words.
      $PKG_UPDATE
      $PKG_INSTALL postgresql-15 postgresql-contrib-15
      systemctl enable --now postgresql
      ;;
    dnf|yum)
      $PKG_INSTALL postgresql15-server postgresql15-contrib
      # initdb is needed for yum as well as dnf, and it fails on a data
      # directory that already holds a cluster, so key it off PG_VERSION.
      if [ ! -f "${PG_DATA_DIR}/PG_VERSION" ]; then
        /usr/pgsql-15/bin/postgresql-15-setup initdb
      fi
      systemctl enable --now postgresql-15
      ;;
  esac
}
# Configure PostgreSQL primary
# Enable WAL archiving and streaming replication on the primary, allow the
# replica and the Zabbix servers in pg_hba.conf, and create the replication
# user, the application user and the zabbix database.
# Globals: PG_DATA_DIR, PG_CONFIG_DIR, PG_SERVICE, REPLICA_DB_IP (read)
#          ZABBIX_IPS (read) - comma-separated addresses; an entry without
#            a "/" gets /32, an entry in CIDR form is used unchanged
#          ZABBIX_REPL_PASSWORD, ZABBIX_DB_PASSWORD (read, optional) -
#            override the placeholder passwords
configure_pg_primary() {
  local sq="'"
  local repl_password="${ZABBIX_REPL_PASSWORD:-repl_password_here}"
  local db_password="${ZABBIX_DB_PASSWORD:-zabbix_password_here}"
  local -a zabbix_sources=()
  local entry hba_lines
  local zabbix_rules=0

  log "[2/8] Configuring PostgreSQL primary server"

  # An empty address would write "/32" as the client address to pg_hba.conf.
  if [ -z "${REPLICA_DB_IP:-}" ]; then
    error "--replica-db-ip is required to configure the primary"
  fi

  # One pg_hba.conf rule per Zabbix address: a single rule cannot take a
  # comma-separated list.
  hba_lines="host replication zabbix_repl ${REPLICA_DB_IP}/32 md5"
  IFS=',' read -r -a zabbix_sources <<< "${ZABBIX_IPS:-}"
  for entry in ${zabbix_sources[@]+"${zabbix_sources[@]}"}; do
    entry="${entry//[[:space:]]/}"
    if [ -z "$entry" ]; then
      continue
    fi
    case "$entry" in
      */*) ;;
      *) entry="${entry}/32" ;;
    esac
    hba_lines+=$'\n'"host zabbix zabbix ${entry} md5"
    zabbix_rules=$((zabbix_rules + 1))
  done
  if [ "$zabbix_rules" -eq 0 ]; then
    warn "No --zabbix-ips given; add pg_hba.conf entries for the Zabbix servers manually"
  fi

  mkdir -p "${PG_DATA_DIR}/archive"
  chown postgres:postgres "${PG_DATA_DIR}/archive"
  cat >> "${PG_CONFIG_DIR}/postgresql.conf" << EOF
listen_addresses = '*'
wal_level = replica
max_wal_senders = 3
max_replication_slots = 3
archive_mode = on
archive_command = 'cp %p ${PG_DATA_DIR}/archive/%f'
hot_standby = on
shared_preload_libraries = 'pg_stat_statements'
max_connections = 200
EOF
  printf '%s\n' "$hba_lines" >> "${PG_CONFIG_DIR}/pg_hba.conf"
  systemctl restart "$PG_SERVICE"

  # Create users and database
  # Double any single quote so the password stays inside its SQL literal.
  repl_password="${repl_password//$sq/$sq$sq}"
  db_password="${db_password//$sq/$sq$sq}"
  sudo -u postgres psql -c "CREATE USER zabbix_repl REPLICATION LOGIN PASSWORD '${repl_password}';"
  sudo -u postgres psql -c "CREATE USER zabbix PASSWORD '${db_password}';"
  sudo -u postgres psql -c "CREATE DATABASE zabbix OWNER zabbix ENCODING 'UTF8' LC_COLLATE 'en_US.UTF-8' LC_CTYPE 'en_US.UTF-8' TEMPLATE template0;"
  sudo -u postgres psql -c "GRANT ALL PRIVILEGES ON DATABASE zabbix TO zabbix;"
}
# Configure PostgreSQL replica
# Rebuild this node as a streaming replica: wipe the data directory, take a
# base backup from the primary, then confirm the server came up in recovery.
# Globals: PG_SERVICE, PG_DATA_DIR, PG_CONFIG_DIR, PRIMARY_DB_IP (read)
#          ZABBIX_REPL_PASSWORD (read, optional) - overrides the placeholder
#            replication password
configure_pg_replica() {
  local repl_password="${ZABBIX_REPL_PASSWORD:-repl_password_here}"
  local in_recovery

  log "[3/8] Configuring PostgreSQL replica server"

  if [ -z "${PRIMARY_DB_IP:-}" ]; then
    error "--primary-db-ip is required to configure the replica"
  fi
  # The wipe below is rooted at PG_DATA_DIR; an empty or "/" value would
  # aim it at the filesystem root.
  if [ -z "${PG_DATA_DIR:-}" ] || [ "$PG_DATA_DIR" = "/" ]; then
    error "PG_DATA_DIR is not set; refusing to wipe the data directory"
  fi

  systemctl stop "$PG_SERVICE"
  # find also removes dot-files, which the "/*" glob left behind although
  # pg_basebackup needs an empty target directory.
  sudo -u postgres find "${PG_DATA_DIR:?}" -mindepth 1 -delete
  sudo -u postgres PGPASSWORD="$repl_password" pg_basebackup -h "$PRIMARY_DB_IP" -D "$PG_DATA_DIR" -U zabbix_repl -v -P -R
  cat >> "${PG_CONFIG_DIR}/postgresql.conf" << EOF
hot_standby = on
max_connections = 200
EOF
  systemctl start "$PG_SERVICE"

  # Verify replication
  # -At prints the bare value, so the check is an exact comparison rather
  # than a search for the letter "t" in formatted output.
  in_recovery=$(sudo -u postgres psql -Atc "SELECT pg_is_in_recovery();") || in_recovery=""
  if [ "$in_recovery" = "t" ]; then
    log "PostgreSQL replica configured successfully"
  else
    error "PostgreSQL replica configuration failed"
  fi
}
# Install Zabbix
# Register the Zabbix 7.0 package repository and install the server,
# frontend, SQL scripts and agent packages.
# Globals: PKG_MGR, PKG_UPDATE, PKG_INSTALL, ID (read)
#          VERSION_ID (read, optional) - selects the matching release
#            package on Ubuntu/Debian; falls back to 24.04 / 12
install_zabbix() {
  local repo_base="https://repo.zabbix.com/zabbix/7.0"
  local os_id="${ID:-}"
  local os_version="${VERSION_ID:-}"
  local pkg url workdir

  log "[4/8] Installing Zabbix server"
  case "$PKG_MGR" in
    apt)
      # Pick the release package built for this OS version; one built for
      # another release would point apt at the wrong suite.
      if [ "$os_id" = "ubuntu" ]; then
        pkg="zabbix-release_7.0-2+ubuntu${os_version:-24.04}_all.deb"
        url="${repo_base}/ubuntu/pool/main/z/zabbix-release/${pkg}"
      else
        pkg="zabbix-release_7.0-2+debian${os_version:-12}_all.deb"
        url="${repo_base}/debian/pool/main/z/zabbix-release/${pkg}"
      fi
      # curl is the downloader check_prerequisites verifies; the package is
      # fetched into a throw-away directory instead of the current one.
      workdir=$(mktemp -d)
      if ! curl -fsSL -o "${workdir}/${pkg}" "$url"; then
        rm -rf -- "$workdir"
        error "Failed to download $url"
      fi
      dpkg -i "${workdir}/${pkg}"
      rm -rf -- "$workdir"
      $PKG_UPDATE
      $PKG_INSTALL zabbix-server-pgsql zabbix-frontend-php zabbix-apache-conf zabbix-sql-scripts zabbix-agent2
      ;;
    dnf|yum)
      # NOTE(review): the release RPM is fixed to EL9/x86_64 - confirm for
      # other releases and architectures.
      # Skipped when zabbix-release is already installed, so a re-run does
      # not attempt the same upgrade again.
      if ! rpm -q zabbix-release > /dev/null 2>&1; then
        if [ "$os_id" = "almalinux" ] || [ "$os_id" = "rocky" ]; then
          rpm -Uvh "${repo_base}/alma/9/x86_64/zabbix-release-7.0-2.el9.noarch.rpm"
        else
          rpm -Uvh "${repo_base}/rhel/9/x86_64/zabbix-release-7.0-2.el9.noarch.rpm"
        fi
      fi
      $PKG_MGR clean all
      $PKG_INSTALL zabbix-server-pgsql zabbix-web-pgsql zabbix-apache-conf zabbix-sql-scripts zabbix-selinux-policy zabbix-agent2
      ;;
  esac
}
# Initialize Zabbix database
# Import the Zabbix schema into the primary database. Only node 1 does this,
# and only while the schema is not there yet.
# Globals: NODE_ID, PRIMARY_DB_IP (read)
#          ZABBIX_DB_PASSWORD (read, optional) - overrides the placeholder
#            application password
init_zabbix_db() {
  local db_password="${ZABBIX_DB_PASSWORD:-zabbix_password_here}"
  local schema="/usr/share/zabbix-sql-scripts/postgresql/server.sql.gz"
  local has_schema

  if [ "${NODE_ID:-}" != "1" ]; then
    return 0
  fi

  log "[5/8] Initializing Zabbix database schema"
  if ! command -v psql > /dev/null 2>&1; then
    error "psql is required to import the schema; install the PostgreSQL client package"
  fi

  # Skip the import when a previous run already created the schema;
  # to_regclass yields NULL for a table that does not exist.
  has_schema=$(sudo -u zabbix PGPASSWORD="$db_password" psql -h "$PRIMARY_DB_IP" -Atc "SELECT to_regclass('public.dbversion') IS NOT NULL;" zabbix)
  if [ "$has_schema" = "t" ]; then
    log "Zabbix schema already present; skipping import"
    return 0
  fi

  # ON_ERROR_STOP makes psql exit non-zero on the first failed statement
  # instead of continuing and reporting success.
  zcat "$schema" | sudo -u zabbix PGPASSWORD="$db_password" psql -v ON_ERROR_STOP=1 -h "$PRIMARY_DB_IP" zabbix
}
# Configure Zabbix server
# Write the Zabbix server configuration for this HA node, then enable and
# (re)start the server, the agent and the web frontend services.
# Arguments: $1 - config file to write (optional; default
#                 /etc/zabbix/zabbix_server.conf)
# Globals:   PRIMARY_DB_IP, NODE_ID, PKG_MGR (read)
#            ZABBIX_DB_PASSWORD (read, optional) - overrides the placeholder
#              application password
configure_zabbix_server() {
  local conf="${1:-/etc/zabbix/zabbix_server.conf}"
  local db_password="${ZABBIX_DB_PASSWORD:-zabbix_password_here}"
  local node_ip
  local -a units=(zabbix-server zabbix-agent2)

  log "[6/8] Configuring Zabbix server for HA"

  # An empty node id would give every node the same HANodeName.
  if [ -z "${NODE_ID:-}" ]; then
    error "--node-id is required to configure a Zabbix server"
  fi
  if [ -z "${PRIMARY_DB_IP:-}" ]; then
    error "--primary-db-ip is required to configure a Zabbix server"
  fi

  # First address reported by `hostname -I`.
  node_ip=$(hostname -I | awk '{print $1}')
  if [ -z "$node_ip" ]; then
    error "Cannot determine this node's IP address for NodeAddress"
  fi

  cat > "$conf" << EOF
LogFile=/var/log/zabbix/zabbix_server.log
LogFileSize=0
PidFile=/run/zabbix/zabbix_server.pid
SocketDir=/run/zabbix
DBHost=${PRIMARY_DB_IP}
DBName=zabbix
DBUser=zabbix
DBPassword=${db_password}
HANodeName=zabbix-server-${NODE_ID}
NodeAddress=${node_ip}
StartPollers=5
StartPollersUnreachable=1
StartTrappers=5
StartPingers=1
StartDiscoverers=1
StartHTTPPollers=1
CacheSize=32M
HistoryCacheSize=16M
HistoryIndexCacheSize=4M
TrendCacheSize=4M
ValueCacheSize=8M
Timeout=4
UnreachablePeriod=45
UnavailableDelay=60
UnreachableDelay=15
LogSlowQueries=3000
EOF

  # The frontend needs its web server running too.
  case "${PKG_MGR:-}" in
    apt) units+=(apache2) ;;
    dnf|yum) units+=(httpd php-fpm) ;;
  esac
  systemctl enable "${units[@]}"
  # restart rather than `enable --now`: a server that is already running
  # would otherwise keep using its previous configuration.
  systemctl restart "${units[@]}"
}
# Configure firewall
# Open the ports the node's role needs and make sure the firewall is on.
# Globals: FIREWALL_CMD, ROLE (read)
configure_firewall() {
  log "[7/8] Configuring firewall"
  case "$FIREWALL_CMD" in
    ufw)
      # Allow SSH before the firewall is switched on, so enabling it does
      # not block the administrator's access.
      ufw allow ssh
      case "$ROLE" in
        db-primary|db-replica)
          ufw allow 5432/tcp
          ;;
        zabbix-server)
          ufw allow 10051/tcp
          ufw allow 80/tcp
          ufw allow 443/tcp
          ;;
        haproxy)
          ufw allow 80/tcp
          ufw allow 443/tcp
          ufw allow 8404/tcp
          ;;
      esac
      # Enabled last, once every rule is in place.
      ufw --force enable
      ;;
    firewall-cmd)
      systemctl enable --now firewalld
      case "$ROLE" in
        db-primary|db-replica)
          firewall-cmd --permanent --add-port=5432/tcp
          ;;
        zabbix-server)
          firewall-cmd --permanent --add-port=10051/tcp
          firewall-cmd --permanent --add-service=http
          firewall-cmd --permanent --add-service=https
          ;;
        haproxy)
          firewall-cmd --permanent --add-service=http
          firewall-cmd --permanent --add-service=https
          firewall-cmd --permanent --add-port=8404/tcp
          ;;
      esac
      # --permanent rules only take effect after a reload.
      firewall-cmd --reload
      ;;
  esac
}
# Verify installation
# Check that the service belonging to the node's role is active, failing
# through error() when it is not, then log the closing messages. Roles
# without a service check only get the closing messages.
verify_installation() {
  local unit="" label=""

  log "[8/8] Verifying installation"
  case "$ROLE" in
    db-primary|db-replica)
      unit="$PG_SERVICE"
      label="PostgreSQL"
      ;;
    zabbix-server)
      unit="zabbix-server"
      label="Zabbix server"
      ;;
  esac

  if [[ -n "$label" ]]; then
    systemctl is-active --quiet "$unit" || error "$label service is not running"
    log "$label service is running"
  fi

  log "Installation completed successfully!"
  log "Check the log file at $LOG_FILE for detailed information."
}
# Main execution
# Parse the command line, check the host, then run the steps for the
# requested role followed by the firewall setup and verification.
# Arguments: "$@" - the script's command-line arguments
main() {
  # Parsed first so --help and argument errors are reported without needing
  # root, curl or a supported distribution.
  parse_args "$@"
  check_prerequisites
  detect_distro
  case "$ROLE" in
    db-primary)
      install_postgresql
      configure_pg_primary
      ;;
    db-replica)
      install_postgresql
      configure_pg_replica
      ;;
    zabbix-server)
      install_zabbix
      init_zabbix_db
      configure_zabbix_server
      ;;
    haproxy)
      # The usage text and the firewall step both list this role, so it is
      # accepted here. Only the package is installed; no haproxy.cfg is
      # written.
      log "Installing HAProxy"
      if [ "$PKG_MGR" = "apt" ]; then
        $PKG_UPDATE
      fi
      $PKG_INSTALL haproxy
      warn "HAProxy is installed with its packaged configuration; deploy /etc/haproxy/haproxy.cfg for the Zabbix frontends and enable the service"
      ;;
    *)
      error "Invalid role: $ROLE"
      ;;
  esac
  configure_firewall
  verify_installation
}
# Entry point: pass the script's command-line arguments through to main.
main "$@"
Review the script before running. It must be run as root and needs a role, for example: sudo bash install.sh --role db-primary --replica-db-ip 10.0.1.11