Set up Envoy Proxy with intelligent gRPC load balancing, health checks, and circuit breakers for production microservices. Includes SSL termination, monitoring integration, and security hardening.
Prerequisites
- Root or sudo access
- Basic understanding of gRPC and microservices
- Network connectivity between Envoy and backend services
- At least 2GB RAM for testing
What this solves
Envoy Proxy provides production-grade load balancing for gRPC services with built-in health checks, circuit breakers, and observability. This configuration handles automatic failover, prevents cascade failures, and gives you detailed metrics on service performance.
Step-by-step configuration
Install Envoy Proxy
Add the official Envoy repository and install the latest stable version.
# Refresh package metadata before adding the third-party repository.
sudo apt update
# -f makes curl exit non-zero on an HTTP error, so an error page is never
# dearmored into the keyring; -S still prints the reason despite -s.
curl -fsSL 'https://deb.dl.getenvoy.io/public/gpg.8115BA8E629CC074.key' | sudo gpg --dearmor -o /usr/share/keyrings/getenvoy-keyring.gpg
# Register the repository, pinned to the keyring written above.
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/getenvoy-keyring.gpg] https://deb.dl.getenvoy.io/public/deb/ubuntu $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/getenvoy.list
# Re-read the package lists so the new repository is visible, then install.
sudo apt update && sudo apt install -y getenvoy-envoy
Create Envoy user and directories
Set up a dedicated user and directory structure for security isolation.
# System account without a login shell; its home directory is created too.
sudo useradd -r -s /bin/false -d /var/lib/envoy -m envoy
# Configuration and log directories.
sudo mkdir -p /etc/envoy
sudo mkdir -p /var/log/envoy
# The state and log directories belong to the envoy account.
sudo chown -R envoy:envoy /var/lib/envoy
sudo chown -R envoy:envoy /var/log/envoy
# The configuration directory is world-readable and traversable.
sudo chmod 755 /etc/envoy
Configure main Envoy configuration
Create the primary configuration file with admin interface, listeners, and cluster definitions.
# /etc/envoy/envoy.yaml
# Plaintext HTTP/2 (gRPC) listener on :8080 that forwards every request to
# grpc_cluster, with active gRPC health checks, circuit breakers and
# outlier detection. The admin interface is bound to loopback only.
admin:
  address:
    socket_address:
      address: 127.0.0.1
      port_value: 9901
  access_log:
  - name: envoy.access_loggers.file
    typed_config:
      "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
      path: "/var/log/envoy/admin.log"
static_resources:
  listeners:
  - name: grpc_listener
    address:
      socket_address:
        address: 0.0.0.0
        port_value: 8080
    filter_chains:
    - filters:
      - name: envoy.filters.network.http_connection_manager
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
          stat_prefix: grpc_proxy
          codec_type: HTTP2
          access_log:
          - name: envoy.access_loggers.file
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
              path: "/var/log/envoy/access.log"
              format: |
                [%START_TIME%] "%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%"
                %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT%
                %DURATION% %RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)% "%REQ(X-FORWARDED-FOR)%"
                "%REQ(USER-AGENT)%" "%REQ(X-REQUEST-ID)%" "%REQ(:AUTHORITY)%" "%UPSTREAM_HOST%"
          http_filters:
          - name: envoy.filters.http.grpc_stats
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.filters.http.grpc_stats.v3.FilterConfig
              emit_filter_state: true
          # The router must be the last HTTP filter in the chain.
          - name: envoy.filters.http.router
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
          route_config:
            name: local_route
            virtual_hosts:
            - name: grpc_backend
              domains: ["*"]
              routes:
              - match:
                  prefix: "/"
                route:
                  cluster: grpc_cluster
                  timeout: 30s
                  retry_policy:
                    retry_on: "5xx,reset,connect-failure,refused-stream"
                    num_retries: 3
                    per_try_timeout: 10s
                    retry_back_off:
                      base_interval: 0.1s
                      max_interval: 1s
  clusters:
  - name: grpc_cluster
    # connect_timeout was missing from the original snippet.
    connect_timeout: 5s
    # `type` is the service-discovery type (STATIC, STRICT_DNS, ...), not the
    # balancing algorithm; the algorithm is selected by lb_policy below.
    type: STRICT_DNS
    lb_policy: ROUND_ROBIN
    http2_protocol_options: {}
    health_checks:
    - timeout: 5s
      interval: 10s
      unhealthy_threshold: 3
      healthy_threshold: 2
      grpc_health_check:
        # Must match a service name the backend registers with its health server.
        service_name: "health"
        authority: "grpc-service"
    circuit_breakers:
      thresholds:
      - priority: DEFAULT
        max_connections: 100
        max_pending_requests: 50
        max_requests: 200
        max_retries: 3
        track_remaining: true
    outlier_detection:
      consecutive_5xx: 3
      consecutive_gateway_failure: 3
      interval: 30s
      base_ejection_time: 30s
      max_ejection_percent: 50
      split_external_local_origin_errors: true
    load_assignment:
      cluster_name: grpc_cluster
      endpoints:
      - lb_endpoints:
        - endpoint:
            address:
              socket_address:
                address: 192.168.1.10
                port_value: 9000
            health_check_config:
              port_value: 9000
        - endpoint:
            address:
              socket_address:
                address: 192.168.1.11
                port_value: 9000
            health_check_config:
              port_value: 9000
        - endpoint:
            address:
              socket_address:
                address: 192.168.1.12
                port_value: 9000
            health_check_config:
              port_value: 9000
Set up gRPC backend services with health endpoints
Install and configure sample gRPC services that implement the health check protocol.
sudo apt install -y golang-go
# Keep the module in its own directory rather than in the GOPATH root: the go
# tool refuses a go.mod placed directly in $GOPATH, and sudo normally drops an
# exported GOPATH anyway. Owning the directory as the current user lets the
# go commands below run without root.
PROJECT_DIR=/opt/grpc-health-server
sudo mkdir -p "$PROJECT_DIR"
sudo chown "$(id -un):$(id -gn)" "$PROJECT_DIR"
cd "$PROJECT_DIR"
go mod init grpc-health-server
go get google.golang.org/grpc
go get google.golang.org/grpc/health
go get google.golang.org/grpc/health/grpc_health_v1
Create sample gRPC health server
Build a simple gRPC server with health check implementation for testing.
// Sample gRPC backend that exposes only the standard gRPC health service on
// TCP port 9000, for use as a health-check target.
package main

import (
	"context"
	"log"
	"net"

	"google.golang.org/grpc"
	"google.golang.org/grpc/health"
	"google.golang.org/grpc/health/grpc_health_v1"
)

// server is a placeholder application service. It is never registered with
// the gRPC server in main, so SayHello is not reachable over the wire.
type server struct{}

// SayHello returns a fixed greeting and never fails.
func (s server) SayHello(ctx context.Context, req HelloRequest) (*HelloResponse, error) {
	return &HelloResponse{Message: "Hello from gRPC server"}, nil
}

// HelloRequest is the (empty) request type for SayHello.
type HelloRequest struct{}

// HelloResponse carries the greeting returned by SayHello.
type HelloResponse struct {
	Message string
}

// main listens on :9000, registers the health service, marks the "health"
// service as SERVING and blocks serving requests. Any listen or serve error
// is fatal.
func main() {
	port := ":9000"
	lis, err := net.Listen("tcp", port)
	if err != nil {
		log.Fatalf("Failed to listen: %v", err)
	}

	s := grpc.NewServer()

	// Register health service
	healthServer := health.NewServer()
	grpc_health_v1.RegisterHealthServer(s, healthServer)

	// Set service status to serving
	healthServer.SetServingStatus("health", grpc_health_v1.HealthCheckResponse_SERVING)

	log.Printf("gRPC server listening on %s", port)
	if err := s.Serve(lis); err != nil {
		log.Fatalf("Failed to serve: %v", err)
	}
}
Configure advanced load balancing algorithms
Update the cluster configuration to use the least-request algorithm with per-endpoint weights, a panic threshold, and zone-aware routing.
# Add this to replace the grpc_cluster section in envoy.yaml
# (it is one entry of the static_resources.clusters list).
# The cluster keeps the name grpc_cluster because the routes refer to it by
# that name; a differently named cluster would leave the routes pointing at a
# cluster that no longer exists.
- name: grpc_cluster
  connect_timeout: 5s
  # `type` is the service-discovery type; the balancing algorithm is lb_policy.
  type: STRICT_DNS
  lb_policy: LEAST_REQUEST
  http2_protocol_options: {}
  health_checks:
  - timeout: 5s
    interval: 10s
    unhealthy_threshold: 3
    healthy_threshold: 2
    grpc_health_check:
      service_name: "health"
      authority: "grpc-service"
    event_log_path: "/var/log/envoy/health_check.log"
  circuit_breakers:
    thresholds:
    - priority: DEFAULT
      max_connections: 100
      max_pending_requests: 50
      max_requests: 200
      max_retries: 3
      track_remaining: true
    - priority: HIGH
      max_connections: 200
      max_pending_requests: 100
      max_requests: 400
      max_retries: 5
  outlier_detection:
    consecutive_5xx: 3
    consecutive_gateway_failure: 3
    interval: 30s
    base_ejection_time: 30s
    max_ejection_percent: 50
    split_external_local_origin_errors: true
    # NOTE(review): the original snippet also set `min_health_percent: 30`
    # here. That is not an outlier_detection field; the 30% floor is expressed
    # by healthy_panic_threshold below.
  common_lb_config:
    healthy_panic_threshold:
      value: 30.0
    zone_aware_lb_config:
      routing_enabled:
        value: 100.0
      min_cluster_size: 3
  load_assignment:
    cluster_name: grpc_cluster
    endpoints:
    - lb_endpoints:
      - endpoint:
          address:
            socket_address:
              address: 192.168.1.10
              port_value: 9000
          health_check_config:
            port_value: 9000
        load_balancing_weight: 100
      - endpoint:
          address:
            socket_address:
              address: 192.168.1.11
              port_value: 9000
          health_check_config:
            port_value: 9000
        load_balancing_weight: 150
      - endpoint:
          address:
            socket_address:
              address: 192.168.1.12
              port_value: 9000
          health_check_config:
            port_value: 9000
        load_balancing_weight: 80
Enable Prometheus metrics collection
Configure Envoy to export detailed metrics for monitoring and alerting.
# Add these as top-level keys in envoy.yaml, next to `admin` and
# `static_resources` (they are bootstrap fields, not part of the admin section).
stats_config:
  stats_tags:
  - tag_name: "cluster_name"
    regex: "^cluster\\.((.+?)\\.)"
  - tag_name: "virtual_host_name"
    regex: "^vhost\\.((.+?)\\.)"
  - tag_name: "listener_address"
    regex: "^listener\\.((.+?)\\.)"
  # NOTE(review): the original snippet had a `stats_matches` list here that
  # attached a HeaderValueOption to a "circuit_breaker" entry. stats_config
  # has no such field, so it was dropped. Use `stats_matcher` if stats need to
  # be filtered.
stats_sinks:
# Streams stats over gRPC. A cluster named metrics_cluster must be defined
# under static_resources.clusters for this sink to work.
- name: envoy.stat_sinks.metrics_service
  typed_config:
    "@type": type.googleapis.com/envoy.config.metrics.v3.MetricsServiceConfig
    transport_api_version: V3
    grpc_service:
      envoy_grpc:
        cluster_name: metrics_cluster
# Pushes stats to a local statsd-compatible agent on 127.0.0.1:9125.
- name: envoy.stat_sinks.statsd
  typed_config:
    "@type": type.googleapis.com/envoy.config.metrics.v3.StatsdSink
    address:
      socket_address:
        address: 127.0.0.1
        port_value: 9125
    prefix: envoy
Configure SSL termination
Add TLS configuration for secure gRPC communication with certificate management.
sudo mkdir -p /etc/envoy/certs
# Self-signed certificate for testing. The subjectAltName is required: current
# TLS clients (including Go-based gRPC clients) ignore the CN and reject a
# certificate that has no SAN. -addext needs OpenSSL 1.1.1 or newer.
sudo openssl req -x509 -newkey rsa:4096 -keyout /etc/envoy/certs/server.key -out /etc/envoy/certs/server.crt -days 365 -nodes -subj "/C=US/ST=State/L=City/O=Organization/CN=grpc.example.com" -addext "subjectAltName=DNS:grpc.example.com"
# Envoy runs as the envoy user and must be able to read both files.
sudo chown -R envoy:envoy /etc/envoy/certs
# Private key: owner only. Certificate: world-readable.
sudo chmod 600 /etc/envoy/certs/server.key
sudo chmod 644 /etc/envoy/certs/server.crt
Update configuration for SSL
Modify the listener configuration to include TLS transport socket.
# Replace the filter_chains section in envoy.yaml
# (indent the whole block so it sits under the grpc_listener entry).
filter_chains:
- filters:
  - name: envoy.filters.network.http_connection_manager
    typed_config:
      "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
      stat_prefix: grpc_proxy_ssl
      codec_type: HTTP2
      access_log:
      - name: envoy.access_loggers.file
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
          path: "/var/log/envoy/ssl_access.log"
      http_filters:
      - name: envoy.filters.http.grpc_stats
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.filters.http.grpc_stats.v3.FilterConfig
          emit_filter_state: true
          stats_for_all_methods: true
      # Fault injection: delays 1% of requests by 100 ms. This is only useful
      # for resilience testing; remove this filter for real production traffic.
      - name: envoy.filters.http.fault
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.filters.http.fault.v3.HTTPFault
          delay:
            fixed_delay: 0.1s
            percentage:
              numerator: 1
              denominator: HUNDRED
      # The router must be the last HTTP filter in the chain.
      - name: envoy.filters.http.router
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
      route_config:
        name: ssl_local_route
        virtual_hosts:
        - name: grpc_ssl_backend
          domains: ["*"]
          routes:
          - match:
              prefix: "/"
              # Only match requests that are gRPC.
              grpc: {}
            route:
              cluster: grpc_cluster
              timeout: 30s
              retry_policy:
                retry_on: "5xx,reset,connect-failure,refused-stream"
                num_retries: 3
                per_try_timeout: 10s
                retry_back_off:
                  base_interval: 0.1s
                  max_interval: 2s
                # Prefer a host that has not already been tried for this request.
                retry_host_predicate:
                - name: envoy.retry_host_predicates.previous_hosts
                  typed_config:
                    "@type": type.googleapis.com/envoy.extensions.retry.host.previous_hosts.v3.PreviousHostsPredicate
  # TLS termination for this filter chain (sibling of `filters`).
  transport_socket:
    name: envoy.transport_sockets.tls
    typed_config:
      "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.DownstreamTlsContext
      common_tls_context:
        tls_certificates:
        - certificate_chain:
            filename: "/etc/envoy/certs/server.crt"
          private_key:
            filename: "/etc/envoy/certs/server.key"
        # gRPC requires HTTP/2, negotiated via ALPN.
        alpn_protocols: ["h2"]
Create systemd service
Set up Envoy as a system service with proper security and restart policies.
# systemd unit that runs Envoy as the unprivileged envoy user with a
# read-only view of the system apart from its log and state directories.
[Unit]
Description=Envoy Proxy
After=network.target
Requires=network.target

[Service]
Type=simple
User=envoy
Group=envoy
ExecStart=/usr/bin/envoy -c /etc/envoy/envoy.yaml
# Envoy does not reload configuration on a signal and ignores SIGHUP.
# SIGUSR1 makes it reopen its access logs, which is what log rotation needs
# from `systemctl reload`.
ExecReload=/bin/kill -USR1 $MAINPID
Restart=always
RestartSec=5
LimitNOFILE=65536
StandardOutput=journal
StandardError=journal
SyslogIdentifier=envoy

# Security settings
NoNewPrivileges=true
PrivateTmp=true
ProtectHome=true
ProtectSystem=strict
# With ProtectSystem=strict these are the only writable paths.
ReadWritePaths=/var/log/envoy /var/lib/envoy
# Allows binding ports below 1024 without running as root.
CapabilityBoundingSet=CAP_NET_BIND_SERVICE
AmbientCapabilities=CAP_NET_BIND_SERVICE

[Install]
WantedBy=multi-user.target
Configure log rotation
Set up logrotate to manage Envoy log files and prevent disk space issues.
# Rotate Envoy's logs daily and keep 30 compressed generations.
/var/log/envoy/*.log {
    daily
    rotate 30
    compress
    # Leave the newest rotated file uncompressed until the next cycle.
    delaycompress
    missingok
    notifempty
    create 644 envoy envoy
    postrotate
        # Envoy reopens its access logs on SIGUSR1. Sending the signal directly
        # does not depend on how the unit's ExecReload is defined; without it
        # Envoy keeps writing to the rotated file.
        /bin/systemctl kill --signal=SIGUSR1 envoy.service > /dev/null 2>&1 || true
    endscript
}
Start and enable Envoy
Enable the service to start automatically and verify it's running correctly.
# Make systemd pick up the newly written unit file.
sudo systemctl daemon-reload
# Register the service for boot, then start it now.
sudo systemctl enable envoy
sudo systemctl start envoy
# Show the current state and the most recent log lines.
sudo systemctl status envoy
Configure firewall rules
Open necessary ports for gRPC traffic and admin interface access.
# Allow inbound TCP traffic to the gRPC listener port.
sudo ufw allow 8080/tcp comment 'Envoy gRPC proxy'
# Allow only loopback-sourced traffic to the admin port.
sudo ufw allow from 127.0.0.1 to any port 9901 comment 'Envoy admin interface'
# Re-read the rule set.
sudo ufw reload
Set up Prometheus monitoring integration
Configure Prometheus to scrape Envoy metrics for comprehensive observability.
# Add this job to your Prometheus configuration
# (one entry of the scrape_configs list).
- job_name: 'envoy-proxy'
  static_configs:
  - targets: ['localhost:9901']
  metrics_path: /stats/prometheus
  scrape_interval: 15s
  scrape_timeout: 10s
  honor_labels: true
  params:
    format: ['prometheus']
  metric_relabel_configs:
  # Metric names look like envoy_cluster_circuit_breakers_default_remaining_cx:
  # the cluster name is a label, not part of the metric name, so the regex
  # must not expect it there. Group 1 is the priority, group 2 the counter.
  - source_labels: [__name__]
    regex: 'envoy_cluster_circuit_breakers_([^_]+)_(.+)'
    target_label: 'circuit_breaker_type'
    replacement: '${2}'
  # e.g. envoy_cluster_health_check_healthy -> health_check_type="healthy"
  - source_labels: [__name__]
    regex: 'envoy_cluster_health_check_(.+)'
    target_label: 'health_check_type'
    replacement: '${1}'
Verify your setup
Test the Envoy configuration and verify all components are working correctly.
# Check Envoy service status
sudo systemctl status envoy
# Verify configuration syntax
envoy --mode validate -c /etc/envoy/envoy.yaml
# Test admin interface
curl -s http://localhost:9901/stats | grep cluster
# Check cluster health status
curl -s http://localhost:9901/clusters | grep health_flags
# Test gRPC endpoint (requires grpcurl)
grpcurl -plaintext localhost:8080 list
# Monitor circuit breaker status
curl -s http://localhost:9901/stats | grep circuit_breaker
# Check health check logs
sudo tail -f /var/log/envoy/health_check.log
# View access logs
sudo tail -f /var/log/envoy/access.log
Advanced circuit breaker configuration
Configure custom circuit breaker thresholds
Fine-tune circuit breaker settings based on your service capacity and requirements.
# Advanced circuit breaker configuration
# (goes inside a cluster definition, replacing its circuit_breakers section).
circuit_breakers:
  thresholds:
  - priority: DEFAULT
    max_connections: 100
    max_pending_requests: 50
    max_requests: 200
    max_retries: 3
    # Export remaining_* gauges so headroom can be graphed and alerted on.
    track_remaining: true
    max_connection_pools: 10
  - priority: HIGH
    max_connections: 200
    max_pending_requests: 100
    max_requests: 400
    max_retries: 5
    track_remaining: true
    max_connection_pools: 20
  per_host_thresholds:
  # Only max_connections is supported per host. The original snippet also set
  # max_pending_requests, max_requests and max_retries here, which Envoy does
  # not accept at this level; cap those in `thresholds` instead.
  - priority: DEFAULT
    max_connections: 20
Configure custom retry policies
Set up intelligent retry mechanisms with backoff strategies and conditions.
# Advanced retry policy configuration
# (goes inside a route's `route:` section).
retry_policy:
  # retriable-status-codes and retriable-headers must be listed here,
  # otherwise the retriable_status_codes and retriable_headers settings
  # below are never consulted.
  retry_on: "5xx,gateway-error,connect-failure,refused-stream,reset,retriable-status-codes,retriable-headers"
  num_retries: 5
  per_try_timeout: 5s
  per_try_idle_timeout: 2s
  retry_back_off:
    base_interval: 0.25s
    max_interval: 5s
  retry_host_predicate:
  # Skip hosts that were already tried for this request.
  - name: envoy.retry_host_predicates.previous_hosts
    typed_config:
      "@type": type.googleapis.com/envoy.extensions.retry.host.previous_hosts.v3.PreviousHostsPredicate
  # Never retry onto hosts marked as canary.
  - name: envoy.retry_host_predicates.omit_canary_hosts
    typed_config:
      "@type": type.googleapis.com/envoy.extensions.retry.host.omit_canary_hosts.v3.OmitCanaryHostsPredicate
  retry_priority:
    name: envoy.retry_priorities.previous_priorities
    typed_config:
      "@type": type.googleapis.com/envoy.extensions.retry.priority.previous_priorities.v3.PreviousPrioritiesConfig
      update_frequency: 2
  retriable_status_codes: [500, 502, 503, 504]
  retriable_headers:
  - name: "x-retry"
    string_match:
      exact: "true"
Production security hardening
Enable access logging with security headers
Configure comprehensive access logging for security monitoring and debugging.
# Add security headers and enhanced logging
# The two header settings belong in the route configuration (route_config or
# a virtual host); access_log belongs in the HttpConnectionManager config.
response_headers_to_add:
- header:
    key: "X-Frame-Options"
    value: "DENY"
  append: false
- header:
    key: "X-Content-Type-Options"
    value: "nosniff"
  append: false
- header:
    key: "X-XSS-Protection"
    value: "1; mode=block"
  append: false
- header:
    key: "Strict-Transport-Security"
    value: "max-age=31536000; includeSubDomains"
  append: false
- header:
    key: "Content-Security-Policy"
    value: "default-src 'self'"
  append: false
# `server` and `x-powered-by` are response headers, so they are stripped from
# responses; the original used request_headers_to_remove, which only touches
# the request sent upstream.
response_headers_to_remove: ["server", "x-powered-by"]
access_log:
- name: envoy.access_loggers.file
  typed_config:
    "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
    path: "/var/log/envoy/security.log"
    format: |
      [%START_TIME%] "%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%"
      %RESPONSE_CODE% %RESPONSE_FLAGS% %CONNECTION_TERMINATION_DETAILS%
      "%REQ(USER-AGENT)%" "%REQ(X-REQUEST-ID)%" "%REQ(:AUTHORITY)%"
      "%UPSTREAM_HOST%" %UPSTREAM_CLUSTER% %UPSTREAM_LOCAL_ADDRESS%
      %DOWNSTREAM_LOCAL_ADDRESS% %DOWNSTREAM_REMOTE_ADDRESS%
      rx_bytes=%BYTES_RECEIVED% tx_bytes=%BYTES_SENT% duration=%DURATION%ms
Configure rate limiting
Implement rate limiting to protect against abuse and ensure fair resource usage.
# Rate limiting configuration
# Add the filter to the HttpConnectionManager's http_filters list, before the
# router filter (the router must stay last).
http_filters:
- name: envoy.filters.http.local_ratelimit
  typed_config:
    "@type": type.googleapis.com/udpa.type.v1.TypedStruct
    type_url: type.googleapis.com/envoy.extensions.filters.http.local_ratelimit.v3.LocalRateLimit
    value:
      stat_prefix: http_local_rate_limiter
      # Bucket holds up to 1000 tokens and gains 100 per second.
      token_bucket:
        max_tokens: 1000
        tokens_per_fill: 100
        fill_interval: 1s
      # Evaluate the limit for 100% of requests...
      filter_enabled:
        runtime_key: local_rate_limit_enabled
        default_value:
          numerator: 100
          denominator: HUNDRED
      # ...and enforce it for 100% of the evaluated requests.
      filter_enforced:
        runtime_key: local_rate_limit_enforced
        default_value:
          numerator: 100
          denominator: HUNDRED
      response_headers_to_add:
      - append: false
        header:
          key: x-local-rate-limit
          value: 'true'
      # One bucket shared by all connections rather than one per connection.
      local_rate_limit_per_downstream_connection: false
      enable_x_ratelimit_headers: DRAFT_VERSION_03
Monitor Envoy performance
Set up Grafana dashboards
Create comprehensive dashboards for monitoring gRPC performance and circuit breaker status. You can integrate this with existing Grafana dashboard configurations for a complete monitoring solution.
{
"dashboard": {
"id": null,
"title": "Envoy gRPC Load Balancer",
"tags": ["envoy", "grpc", "load-balancer"],
"style": "dark",
"timezone": "browser",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(envoy_cluster_upstream_rq_total[5m])",
"legendFormat": "{{cluster_name}}"
}
]
},
{
"title": "Circuit Breaker Status",
"type": "stat",
"targets": [
{
"expr": "envoy_cluster_circuit_breakers_default_remaining_cx",
"legendFormat": "Connections Remaining"
},
{
"expr": "envoy_cluster_circuit_breakers_default_remaining_rq",
"legendFormat": "Requests Remaining"
}
]
},
{
"title": "Health Check Status",
"type": "stat",
"targets": [
{
"expr": "envoy_cluster_health_check_healthy",
"legendFormat": "{{cluster_name}} Healthy"
}
]
},
{
"title": "Response Times",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.50, rate(envoy_cluster_upstream_rq_time_bucket[5m]))",
"legendFormat": "p50"
},
{
"expr": "histogram_quantile(0.95, rate(envoy_cluster_upstream_rq_time_bucket[5m]))",
"legendFormat": "p95"
}
]
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "5s"
}
}
Configure Prometheus alerts
Set up alerts for circuit breaker trips and health check failures.
# Prometheus alerting rules for the Envoy gRPC proxy.
groups:
- name: envoy_grpc
  rules:
  - alert: EnvoyCircuitBreakerOpen
    expr: envoy_cluster_circuit_breakers_default_remaining_cx < 10
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: "Envoy circuit breaker nearly triggered"
      description: "Circuit breaker for cluster {{ $labels.cluster_name }} has less than 10 connections remaining"
  - alert: EnvoyHealthCheckFailed
    expr: envoy_cluster_health_check_healthy == 0
    for: 60s
    labels:
      severity: critical
    annotations:
      summary: "Envoy health check failure"
      description: "All health checks failed for cluster {{ $labels.cluster_name }}"
  - alert: EnvoyHighErrorRate
    # The numerator carries an envoy_response_code_class label that the
    # denominator lacks; without ignoring(...) the two sides never match and
    # the alert can never fire.
    expr: rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / ignoring(envoy_response_code_class) rate(envoy_cluster_upstream_rq_total[5m]) > 0.1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "High error rate in Envoy cluster"
      description: "Error rate is {{ $value | humanizePercentage }} for cluster {{ $labels.cluster_name }}"
  - alert: EnvoyHighLatency
    expr: histogram_quantile(0.95, rate(envoy_cluster_upstream_rq_time_bucket[5m])) > 1000
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High latency in Envoy cluster"
      description: "95th percentile latency is {{ $value }}ms for cluster {{ $labels.cluster_name }}"
  - alert: EnvoyUpstreamConnectionFailure
    expr: rate(envoy_cluster_upstream_cx_connect_fail[5m]) > 0.1
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Envoy upstream connection failures"
      description: "Connection failure rate is {{ $value | humanize }} per second for cluster {{ $labels.cluster_name }}"
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Envoy won't start | Configuration syntax error | envoy --mode validate -c /etc/envoy/envoy.yaml to check config |
| Health checks failing | Backend services not implementing health check protocol | Ensure gRPC services implement grpc.health.v1.Health service |
| Circuit breaker always open | Thresholds set too low for traffic volume | Increase max_connections and max_requests values |
| SSL handshake failures | Certificate path or permissions incorrect | Verify cert paths and chown envoy:envoy /etc/envoy/certs/* |
| High memory usage | Too many connections or large buffers | Tune per_connection_buffer_limit_bytes (listener and cluster) and the circuit breaker connection limits |
| Metrics not appearing | Prometheus scrape configuration incorrect | Check /stats/prometheus endpoint and Prometheus target status |
| Load balancing uneven | Health check or endpoint weights misconfigured | Verify endpoint weights and health status via admin interface |
Next steps
- Configure Istio distributed tracing with Jaeger and Zipkin for end-to-end observability
- Implement Kubernetes network policies with Calico for enhanced security
- Set up Envoy with Consul service discovery for dynamic backend management
- Implement gRPC authentication with JWT tokens for secure API access
- Configure Envoy multi-cluster federation for cross-region load balancing
Running this in production?
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
# Installs Envoy and configures it as a gRPC load balancer.
# Positional arguments (all optional): backends, listen port, admin port.
set -o errexit -o nounset -o pipefail

# ANSI colour escapes; they are interpreted when printed by the log helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Positional arguments, falling back to defaults when unset or empty.
GRPC_BACKENDS=${1:-"127.0.0.1:50051,127.0.0.1:50052"}
LISTEN_PORT=${2:-"8080"}
ADMIN_PORT=${3:-"9901"}
#######################################
# Print the command-line synopsis and terminate the script.
# Arguments: none
# Outputs:   four lines of help text on stdout
# Returns:   does not return; exits with status 1
#######################################
usage() {
  cat <<HELP
Usage: $0 [grpc_backends] [listen_port] [admin_port]
 grpc_backends: Comma-separated list of gRPC backends (default: 127.0.0.1:50051,127.0.0.1:50052)
 listen_port: Port for Envoy to listen on (default: 8080)
 admin_port: Admin interface port (default: 9901)
HELP
  exit 1
}
# Logging helpers. Each prints its first argument behind a coloured level tag.
# Informational messages go to stdout; warnings and errors go to stderr so
# they remain visible when stdout is redirected or captured.
# Globals: GREEN, YELLOW, RED, NC (read; treated as empty when unset)
log_info() { printf '%b\n' "${GREEN-}[INFO]${NC-} ${1-}"; }
log_warn() { printf '%b\n' "${YELLOW-}[WARN]${NC-} ${1-}" >&2; }
log_error() { printf '%b\n' "${RED-}[ERROR]${NC-} ${1-}" >&2; }
#######################################
# Roll back a partially completed installation.
# Does nothing unless CLEANUP_NEEDED is "true". Every step is best-effort, so
# a missing service, user or path does not abort the rollback.
# NOTE(review): this also removes an envoy user and /etc/envoy that existed
# before the script ran; confirm that is acceptable for re-runs.
# Globals:   CLEANUP_NEEDED (read, then reset to "false")
# Arguments: none
# Returns:   0
#######################################
cleanup() {
  if [[ "${CLEANUP_NEEDED:-}" == "true" ]]; then
    # Reset first so the rollback can never run twice.
    CLEANUP_NEEDED="false"
    log_warn "Installation failed, cleaning up..."
    systemctl stop envoy 2>/dev/null || true
    systemctl disable envoy 2>/dev/null || true
    userdel -r envoy 2>/dev/null || true
    rm -rf /etc/envoy /var/log/envoy /etc/systemd/system/envoy.service 2>/dev/null || true
  fi
  return 0
}
# EXIT rather than ERR: an ERR trap is skipped when the script stops through
# an explicit `exit 1`, which left a half-installed system behind. The EXIT
# trap covers both cases; on success CLEANUP_NEEDED is "false" by then.
trap cleanup EXIT
# Show the help text on request, before the root check so any user can read it.
case "${1:-}" in
  -h | --help) usage ;;
esac

# Check if running as root
if [[ "${EUID:-$(id -u)}" -ne 0 ]]; then
  log_error "This script must be run as root"
  exit 1
fi

# Detect distribution
# Sets PKG_MGR, PKG_UPDATE, PKG_INSTALL and FIREWALL_CMD for the later steps.
echo "[1/10] Detecting distribution..."
if [[ -f /etc/os-release ]]; then
  # shellcheck source=/dev/null
  . /etc/os-release
  # ID and PRETTY_NAME come from os-release; default them so a minimal file
  # does not trip `set -u`.
  case "${ID:-}" in
    ubuntu | debian)
      PKG_MGR="apt"
      PKG_UPDATE="apt update"
      PKG_INSTALL="apt install -y"
      FIREWALL_CMD="ufw"
      ;;
    almalinux | rocky | centos | rhel | ol | fedora)
      PKG_MGR="dnf"
      # Refresh metadata only, matching `apt update`; the original
      # `dnf update -y` upgraded every package on the system as a side effect.
      PKG_UPDATE="dnf -y makecache"
      PKG_INSTALL="dnf install -y"
      FIREWALL_CMD="firewall-cmd"
      ;;
    amzn)
      PKG_MGR="yum"
      PKG_UPDATE="yum -y makecache"
      PKG_INSTALL="yum install -y"
      FIREWALL_CMD="firewall-cmd"
      ;;
    *)
      log_error "Unsupported distribution: ${ID:-unknown}"
      exit 1
      ;;
  esac
  log_info "Detected ${PRETTY_NAME:-unknown} (${ID:-unknown})"
else
  log_error "Cannot detect distribution"
  exit 1
fi

# From here on a failure leaves partial state behind; arm the rollback.
CLEANUP_NEEDED="true"
# Install prerequisites
echo "[2/10] Installing prerequisites..."
# PKG_UPDATE / PKG_INSTALL hold a command plus its arguments and are split
# into words on purpose.
# shellcheck disable=SC2086
$PKG_UPDATE >/dev/null
if [[ "$PKG_MGR" == "apt" ]]; then
  # shellcheck disable=SC2086
  $PKG_INSTALL curl gnupg2 lsb-release >/dev/null
else
  # lsb-release is a Debian/Ubuntu package name and is only needed to build
  # the apt source line; asking dnf/yum for it makes the install fail.
  # shellcheck disable=SC2086
  $PKG_INSTALL curl gnupg2 >/dev/null
fi

# Install Envoy Proxy
echo "[3/10] Installing Envoy Proxy..."
if [[ "$PKG_MGR" == "apt" ]]; then
  # -f: fail on HTTP errors instead of dearmoring an error page.
  # --batch --yes: overwrite an existing keyring when the script is re-run.
  curl -fsSL 'https://deb.dl.getenvoy.io/public/gpg.8115BA8E629CC074.key' \
    | gpg --batch --yes --dearmor -o /usr/share/keyrings/getenvoy-keyring.gpg
  # NOTE(review): the repository path is hard-coded to .../deb/ubuntu even
  # when the detected distribution is debian; confirm that is intended.
  echo "deb [arch=amd64 signed-by=/usr/share/keyrings/getenvoy-keyring.gpg] https://deb.dl.getenvoy.io/public/deb/ubuntu $(lsb_release -cs) main" > /etc/apt/sources.list.d/getenvoy.list
  apt update >/dev/null
  apt install -y getenvoy-envoy
else
  # rpm fetches the key from the URL itself; piping it to `rpm --import -`
  # is not reliable.
  rpm --import 'https://rpm.dl.getenvoy.io/public/gpg.CF716AF503183491.key'
  # \$basearch is escaped so that yum/dnf expands it, not the shell.
  # NOTE(review): the baseurl is pinned to el/8 for every RPM distribution.
  cat > /etc/yum.repos.d/getenvoy.repo <<EOF
[getenvoy]
name=GetEnvoy
baseurl=https://rpm.dl.getenvoy.io/public/rpm/el/8/\$basearch
enabled=1
gpgcheck=1
gpgkey=https://rpm.dl.getenvoy.io/public/gpg.CF716AF503183491.key
EOF
  # shellcheck disable=SC2086
  $PKG_INSTALL getenvoy-envoy
fi
# Create Envoy user and directories
echo "[4/10] Creating Envoy user and directories..."
# Create the account only when it is missing. The original ran useradd with
# `2>/dev/null || true`, which also hid real failures and left the chown
# below to fail with a confusing message.
if ! getent passwd envoy >/dev/null; then
  useradd --system --shell /bin/false --home-dir /var/lib/envoy --create-home envoy
fi
# /var/lib/envoy is created explicitly in case the account pre-existed
# without a home directory.
mkdir -p /etc/envoy /var/log/envoy /var/lib/envoy
chown -R envoy:envoy /var/lib/envoy /var/log/envoy
chmod 755 /etc/envoy
chmod 755 /var/log/envoy
# Generate Envoy configuration
echo "[5/10] Generating Envoy configuration..."

# Succeeds when $1 is a decimal TCP port in 1..65535.
is_valid_port() {
  [[ "${1-}" =~ ^[0-9]{1,5}$ ]] && ((10#$1 >= 1 && 10#$1 <= 65535))
}

# Validate everything before touching the config file, so bad input never
# leaves a half-written envoy.yaml behind.
for port_arg in "$LISTEN_PORT" "$ADMIN_PORT"; do
  if ! is_valid_port "$port_arg"; then
    log_error "Invalid port: '${port_arg}' (expected 1-65535)"
    exit 1
  fi
done

IFS=',' read -ra BACKENDS <<< "$GRPC_BACKENDS"
BACKEND_HOSTS=()
BACKEND_PORTS=()
for backend in "${BACKENDS[@]}"; do
  # Split on the LAST colon so bracketed IPv6 literals keep their own colons.
  backend_host="${backend%:*}"
  backend_port="${backend##*:}"
  if [[ "$backend" != *:* || -z "$backend_host" ]] || ! is_valid_port "$backend_port"; then
    log_error "Invalid backend '${backend}' (expected host:port)"
    exit 1
  fi
  # Envoy expects IPv6 addresses without the surrounding brackets.
  backend_host="${backend_host#\[}"
  backend_host="${backend_host%\]}"
  BACKEND_HOSTS+=("$backend_host")
  BACKEND_PORTS+=("$backend_port")
done

# The YAML nesting below is significant: a flattened document is rejected by
# Envoy. The endpoint entries are appended by the loop that follows and must
# line up under `lb_endpoints`.
cat > /etc/envoy/envoy.yaml <<EOF
admin:
  address:
    socket_address:
      address: 127.0.0.1
      port_value: ${ADMIN_PORT}
  access_log:
  - name: envoy.access_loggers.file
    typed_config:
      "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
      path: "/var/log/envoy/admin.log"
static_resources:
  listeners:
  - name: grpc_listener
    address:
      socket_address:
        address: 0.0.0.0
        port_value: ${LISTEN_PORT}
    filter_chains:
    - filters:
      - name: envoy.filters.network.http_connection_manager
        typed_config:
          "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
          stat_prefix: grpc_proxy
          codec_type: HTTP2
          access_log:
          - name: envoy.access_loggers.file
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog
              path: "/var/log/envoy/access.log"
          http_filters:
          - name: envoy.filters.http.grpc_stats
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.filters.http.grpc_stats.v3.FilterConfig
              emit_filter_state: true
          - name: envoy.filters.http.router
            typed_config:
              "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
          route_config:
            name: local_route
            virtual_hosts:
            - name: grpc_backend
              domains: ["*"]
              routes:
              - match:
                  prefix: "/"
                route:
                  cluster: grpc_cluster
                  timeout: 30s
                  retry_policy:
                    retry_on: "5xx,reset,connect-failure,refused-stream"
                    num_retries: 3
                    per_try_timeout: 10s
  clusters:
  - name: grpc_cluster
    connect_timeout: 5s
    type: STRICT_DNS
    http2_protocol_options: {}
    lb_policy: ROUND_ROBIN
    health_checks:
    - timeout: 5s
      interval: 10s
      unhealthy_threshold: 3
      healthy_threshold: 2
      grpc_health_check:
        service_name: ""
    circuit_breakers:
      thresholds:
      - priority: DEFAULT
        max_connections: 100
        max_pending_requests: 50
        max_requests: 200
        max_retries: 3
    outlier_detection:
      consecutive_5xx: 5
      interval: 30s
      base_ejection_time: 30s
      max_ejection_percent: 50
    load_assignment:
      cluster_name: grpc_cluster
      endpoints:
      - lb_endpoints:
EOF

# Add backend endpoints
for i in "${!BACKEND_HOSTS[@]}"; do
  cat >> /etc/envoy/envoy.yaml <<EOF
        - endpoint:
            address:
              socket_address:
                address: "${BACKEND_HOSTS[$i]}"
                port_value: ${BACKEND_PORTS[$i]}
EOF
done

chown envoy:envoy /etc/envoy/envoy.yaml
chmod 644 /etc/envoy/envoy.yaml
# Create systemd service
echo "[6/10] Creating systemd service..."
# The delimiter is quoted, so the unit text is written literally and
# $MAINPID reaches systemd unexpanded without needing a backslash.
cat > /etc/systemd/system/envoy.service <<'UNIT'
[Unit]
Description=Envoy Proxy
Documentation=https://www.envoyproxy.io/
After=network.target
Wants=network.target
[Service]
Type=simple
User=envoy
Group=envoy
ExecStart=/usr/bin/envoy -c /etc/envoy/envoy.yaml
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT
Restart=on-failure
RestartSec=5
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
UNIT
chmod 644 /etc/systemd/system/envoy.service
# Make systemd re-read unit files so the new service is known.
systemctl daemon-reload
# Configure firewall
# Opens only the listener port; the admin port stays closed because the admin
# interface is bound to loopback. Uses whichever firewall front end is
# installed. Failures are ignored on purpose: the firewall may be installed
# but inactive.
echo "[7/10] Configuring firewall..."
if command -v ufw >/dev/null 2>&1; then
  ufw allow "${LISTEN_PORT}/tcp" >/dev/null 2>&1 || true
elif command -v firewall-cmd >/dev/null 2>&1; then
  firewall-cmd --permanent --add-port="${LISTEN_PORT}/tcp" >/dev/null 2>&1 || true
  firewall-cmd --reload >/dev/null 2>&1 || true
fi

# Configure SELinux if enabled
# Best-effort: semanage may be missing, or the port may already be labelled.
echo "[8/10] Configuring SELinux..."
if command -v getenforce >/dev/null 2>&1 && [[ "$(getenforce)" == "Enforcing" ]]; then
  setsebool -P httpd_can_network_connect 1 >/dev/null 2>&1 || true
  semanage port -a -t http_port_t -p tcp "${LISTEN_PORT}" >/dev/null 2>&1 || true
fi
# Start and enable service
echo "[9/10] Starting Envoy service..."
systemctl enable envoy
systemctl start envoy

# Verify installation
echo "[10/10] Verifying installation..."
# Give the process a moment to either settle or crash.
sleep 3
if systemctl is-active envoy >/dev/null 2>&1; then
  log_info "Envoy service is running"
else
  log_error "Envoy service failed to start"
  # `status` exits non-zero for a failed unit; guard it so `set -e` does not
  # end the script before the explicit exit below.
  systemctl status envoy --no-pager || true
  exit 1
fi

# -f is required: without it curl exits 0 on an HTTP error status, so the
# check would pass even when /ready does not return success.
if curl -fs "http://127.0.0.1:${ADMIN_PORT}/ready" >/dev/null; then
  log_info "Envoy admin interface is accessible"
else
  log_warn "Envoy admin interface is not ready yet"
fi

# Everything succeeded; disarm the rollback.
CLEANUP_NEEDED="false"
log_info "Envoy Proxy installation completed successfully!"
echo ""
echo "Configuration:"
echo " Listen port: ${LISTEN_PORT}"
echo " Admin port: ${ADMIN_PORT}"
echo " Backends: ${GRPC_BACKENDS}"
echo " Config file: /etc/envoy/envoy.yaml"
echo " Logs: /var/log/envoy/"
echo ""
echo "Usage:"
echo " systemctl start|stop|restart envoy"
echo " Admin interface: http://127.0.0.1:${ADMIN_PORT}"
echo " Health check: curl http://127.0.0.1:${ADMIN_PORT}/ready"
Review the script before running. It must run as root, so execute it with: sudo bash install.sh