Deploy Apache Spark 3.5 on Kubernetes with automatic cluster scaling, dynamic resource allocation, and comprehensive monitoring for production data processing workloads.
Prerequisites
- Kubernetes cluster v1.26+
- kubectl with admin access
- Helm 3.8+
- 8GB RAM per node minimum
What this solves
This tutorial helps you deploy Apache Spark 3.5 on Kubernetes with automatic scaling capabilities for data processing workloads. You'll configure cluster autoscaling to handle varying computational demands, implement dynamic resource allocation for Spark executors, and set up comprehensive monitoring with Prometheus and Grafana.
Prerequisites
- Kubernetes cluster (v1.26+) with at least 3 worker nodes
- kubectl configured with cluster admin access
- Helm 3.8+ installed
- At least 8GB RAM and 4 CPU cores per node
Step-by-step installation
Update system packages
Start by updating your system packages to ensure you have the latest versions.
sudo apt update && sudo apt upgrade -y
sudo apt install -y curl wget unzip
Install Kubernetes cluster components
Install kubectl and verify your cluster connection before proceeding with Spark deployment.
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
kubectl version --client
Install Helm package manager
Helm will manage the Spark Operator and other Kubernetes components required for our setup.
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
helm version
helm repo add spark-operator https://googlecloudplatform.github.io/spark-on-k8s-operator
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
Create dedicated namespace
Create a dedicated namespace for Spark workloads to isolate resources and apply security policies.
kubectl create namespace spark-system
kubectl create namespace spark-jobs
kubectl create namespace monitoring
Configure cluster autoscaler
Deploy the Kubernetes cluster autoscaler to automatically scale nodes based on resource demands.
apiVersion: apps/v1
kind: Deployment
metadata:
name: cluster-autoscaler
namespace: kube-system
labels:
app: cluster-autoscaler
spec:
selector:
matchLabels:
app: cluster-autoscaler
replicas: 1
template:
metadata:
labels:
app: cluster-autoscaler
spec:
priorityClassName: system-cluster-critical
securityContext:
runAsNonRoot: true
runAsUser: 65534
fsGroup: 65534
serviceAccountName: cluster-autoscaler
containers:
- image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.28.0
name: cluster-autoscaler
resources:
limits:
cpu: 100m
memory: 600Mi
requests:
cpu: 100m
memory: 600Mi
command:
- ./cluster-autoscaler
- --v=4
- --stderrthreshold=info
- --cloud-provider=aws
- --skip-nodes-with-local-storage=false
- --expander=least-waste
- --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/spark-cluster
- --balance-similar-node-groups
- --skip-nodes-with-system-pods=false
- --scale-down-enabled=true
- --scale-down-delay-after-add=10m
- --scale-down-unneeded-time=10m
env:
- name: AWS_REGION
value: us-west-2
volumeMounts:
- name: ssl-certs
mountPath: /etc/ssl/certs/ca-certificates.crt
readOnly: true
imagePullPolicy: "Always"
volumes:
- name: ssl-certs
hostPath:
path: "/etc/ssl/certs/ca-bundle.crt"
nodeSelector:
kubernetes.io/os: linux
Create cluster autoscaler service account
Configure RBAC permissions for the cluster autoscaler to manage node groups.
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
name: cluster-autoscaler
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: cluster-autoscaler
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
rules:
- apiGroups: [""]
resources: ["events", "endpoints"]
verbs: ["create", "patch"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
- apiGroups: [""]
resources: ["pods/status"]
verbs: ["update"]
- apiGroups: [""]
resources: ["endpoints"]
resourceNames: ["cluster-autoscaler"]
verbs: ["get", "update"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["watch", "list", "get", "update"]
- apiGroups: [""]
resources: ["pods", "services", "replicationcontrollers", "persistentvolumeclaims", "persistentvolumes"]
verbs: ["watch", "list", "get"]
- apiGroups: ["extensions"]
resources: ["replicasets", "daemonsets"]
verbs: ["watch", "list", "get"]
- apiGroups: ["policy"]
resources: ["poddisruptionbudgets"]
verbs: ["watch", "list"]
- apiGroups: ["apps"]
resources: ["statefulsets", "replicasets", "daemonsets"]
verbs: ["watch", "list", "get"]
- apiGroups: ["storage.k8s.io"]
resources: ["storageclasses", "csinodes"]
verbs: ["watch", "list", "get"]
- apiGroups: ["batch", "extensions"]
resources: ["jobs"]
verbs: ["get", "list", "watch", "patch"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["create"]
- apiGroups: ["coordination.k8s.io"]
resourceNames: ["cluster-autoscaler"]
resources: ["leases"]
verbs: ["get", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: cluster-autoscaler
labels:
k8s-addon: cluster-autoscaler.addons.k8s.io
k8s-app: cluster-autoscaler
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cluster-autoscaler
subjects:
- kind: ServiceAccount
name: cluster-autoscaler
namespace: kube-system
Deploy cluster autoscaler
Apply the cluster autoscaler configuration to enable automatic node scaling.
kubectl apply -f /tmp/cluster-autoscaler-rbac.yaml
kubectl apply -f /tmp/cluster-autoscaler.yaml
kubectl -n kube-system logs -l app=cluster-autoscaler
Install Spark Operator
Deploy the Spark Operator using Helm to manage Spark applications on Kubernetes.
helm install spark-operator spark-operator/spark-operator \
--namespace spark-system \
--set webhook.enable=true \
--set resources.limits.cpu=200m \
--set resources.limits.memory=512Mi \
--set sparkJobNamespace=spark-jobs \
--set enableBatchScheduler=true \
--set volcano.enabled=false \
--wait
Configure Spark service account
Create service accounts and RBAC permissions for Spark driver and executor pods.
apiVersion: v1
kind: ServiceAccount
metadata:
name: spark-driver
namespace: spark-jobs
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
namespace: spark-jobs
name: spark-role
rules:
- apiGroups: [""]
resources: ["pods"]
verbs: ["*"]
- apiGroups: [""]
resources: ["services"]
verbs: ["*"]
- apiGroups: [""]
resources: ["configmaps"]
verbs: ["*"]
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["*"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: spark-role-binding
namespace: spark-jobs
subjects:
- kind: ServiceAccount
name: spark-driver
namespace: spark-jobs
roleRef:
kind: Role
name: spark-role
apiGroup: rbac.authorization.k8s.io
kubectl apply -f /tmp/spark-rbac.yaml
Create Spark configuration
Configure Spark with dynamic allocation and Kubernetes-specific settings for optimal performance.
apiVersion: v1
kind: ConfigMap
metadata:
name: spark-config
namespace: spark-jobs
data:
spark-defaults.conf: |
# Dynamic allocation settings
spark.dynamicAllocation.enabled=true
spark.dynamicAllocation.shuffleTracking.enabled=true
spark.dynamicAllocation.minExecutors=1
spark.dynamicAllocation.maxExecutors=50
spark.dynamicAllocation.initialExecutors=2
spark.dynamicAllocation.executorIdleTimeout=30s
spark.dynamicAllocation.cachedExecutorIdleTimeout=60s
spark.dynamicAllocation.schedulerBacklogTimeout=1s
# Kubernetes settings
spark.kubernetes.executor.deleteOnTermination=true
spark.kubernetes.driver.deleteOnTermination=true
spark.kubernetes.executor.podNamePrefix=spark-exec
# Performance tuning
spark.sql.adaptive.enabled=true
spark.sql.adaptive.coalescePartitions.enabled=true
spark.sql.adaptive.skewJoin.enabled=true
spark.serializer=org.apache.spark.serializer.KryoSerializer
# Resource management
spark.kubernetes.executor.request.cores=0.5
spark.kubernetes.executor.limit.cores=2
spark.executor.memory=2g
spark.driver.memory=1g
# Monitoring
spark.ui.prometheus.enabled=true
spark.sql.streaming.metricsEnabled=true
spark.eventLog.enabled=true
spark.eventLog.dir=/tmp/spark-events
kubectl apply -f /tmp/spark-config.yaml
Deploy Horizontal Pod Autoscaler
Configure HPA to automatically scale Spark application pods based on CPU and memory usage.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: spark-driver-hpa
namespace: spark-jobs
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: spark-driver
minReplicas: 1
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
behavior:
scaleDown:
stabilizationWindowSeconds: 300
policies:
- type: Percent
value: 50
periodSeconds: 60
scaleUp:
stabilizationWindowSeconds: 60
policies:
- type: Percent
value: 100
periodSeconds: 15
- type: Pods
value: 2
periodSeconds: 60
kubectl apply -f /tmp/spark-hpa.yaml
Install Prometheus for monitoring
Deploy Prometheus to collect metrics from Spark applications and Kubernetes cluster.
helm install prometheus prometheus-community/kube-prometheus-stack \
--namespace monitoring \
--set grafana.enabled=true \
--set prometheus.prometheusSpec.retention=30d \
--set prometheus.prometheusSpec.storageSpec.volumeClaimTemplate.spec.resources.requests.storage=50Gi \
--set grafana.persistence.enabled=true \
--set grafana.persistence.size=10Gi \
--set alertmanager.persistence.enabled=true \
--wait
Configure Spark metrics collection
Set up Prometheus ServiceMonitor to collect metrics from Spark applications.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: spark-metrics
namespace: monitoring
labels:
app: spark
release: prometheus
spec:
namespaceSelector:
matchNames:
- spark-jobs
selector:
matchLabels:
spark-role: driver
endpoints:
- port: metrics
path: /metrics
interval: 30s
scrapeTimeout: 10s
---
apiVersion: v1
kind: Service
metadata:
name: spark-driver-metrics
namespace: spark-jobs
labels:
spark-role: driver
spec:
ports:
- name: metrics
port: 4040
targetPort: 4040
selector:
spark-role: driver
kubectl apply -f /tmp/spark-servicemonitor.yaml
Create sample Spark application
Deploy a test Spark application to verify the dynamic scaling configuration.
apiVersion: sparkoperator.k8s.io/v1beta2
kind: SparkApplication
metadata:
name: spark-pi
namespace: spark-jobs
spec:
type: Scala
mode: cluster
image: apache/spark:3.5.0
imagePullPolicy: IfNotPresent
mainClass: org.apache.spark.examples.SparkPi
mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.5.0.jar
arguments:
- "1000"
sparkVersion: 3.5.0
restartPolicy:
type: Never
driver:
serviceAccount: spark-driver
cores: 1
coreLimit: 1200m
memory: 1g
labels:
spark-role: driver
version: 3.5.0
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "4040"
prometheus.io/path: "/metrics"
executor:
cores: 1
instances: 2
memory: 1g
labels:
spark-role: executor
version: 3.5.0
dynamicAllocation:
enabled: true
initialExecutors: 2
minExecutors: 1
maxExecutors: 10
targetExecutors: 5
monitoring:
exposeDriverMetrics: true
exposeExecutorMetrics: true
prometheus:
jmxExporterJar: "/opt/spark/jars/jmx_prometheus_javaagent-0.17.2.jar"
port: 8090
kubectl apply -f /tmp/spark-pi-app.yaml
Configure Grafana dashboards
Import pre-built Grafana dashboards for Spark and Kubernetes monitoring.
# Get Grafana admin password
kubectl get secret -n monitoring prometheus-grafana -o jsonpath="{.data.admin-password}" | base64 --decode
echo
Port forward to access Grafana
kubectl port-forward -n monitoring svc/prometheus-grafana 3000:80 &
echo "Grafana accessible at http://localhost:3000"
Configure dynamic allocation policies
Create resource quotas
Set namespace-level resource quotas to prevent resource exhaustion during scaling events.
apiVersion: v1
kind: ResourceQuota
metadata:
name: spark-quota
namespace: spark-jobs
spec:
hard:
requests.cpu: "100"
requests.memory: 200Gi
limits.cpu: "200"
limits.memory: 400Gi
persistentvolumeclaims: "10"
pods: "100"
---
apiVersion: v1
kind: LimitRange
metadata:
name: spark-limits
namespace: spark-jobs
spec:
limits:
- default:
cpu: 2
memory: 4Gi
defaultRequest:
cpu: 100m
memory: 512Mi
type: Container
- max:
cpu: 8
memory: 16Gi
min:
cpu: 100m
memory: 128Mi
type: Container
kubectl apply -f /tmp/resource-quota.yaml
Configure pod disruption budgets
Set pod disruption budgets to maintain application availability during node scaling events.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: spark-driver-pdb
namespace: spark-jobs
spec:
minAvailable: 1
selector:
matchLabels:
spark-role: driver
---
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
name: spark-executor-pdb
namespace: spark-jobs
spec:
maxUnavailable: 50%
selector:
matchLabels:
spark-role: executor
kubectl apply -f /tmp/pod-disruption-budget.yaml
Production optimization and security
Enable network policies
Implement network segmentation for enhanced security between Spark components.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: spark-network-policy
namespace: spark-jobs
spec:
podSelector:
matchLabels:
app: spark
policyTypes:
- Ingress
- Egress
ingress:
- from:
- namespaceSelector:
matchLabels:
name: spark-jobs
- namespaceSelector:
matchLabels:
name: monitoring
ports:
- protocol: TCP
port: 4040
- protocol: TCP
port: 7077
- protocol: TCP
port: 8080
egress:
- to:
- namespaceSelector:
matchLabels:
name: spark-jobs
- to: []
ports:
- protocol: TCP
port: 53
- protocol: UDP
port: 53
- to: []
ports:
- protocol: TCP
port: 443
- protocol: TCP
port: 80
kubectl apply -f /tmp/network-policy.yaml
Configure security contexts
Apply security contexts and pod security standards to harden Spark deployments.
apiVersion: v1
kind: Namespace
metadata:
name: spark-jobs
labels:
pod-security.kubernetes.io/enforce: restricted
pod-security.kubernetes.io/audit: restricted
pod-security.kubernetes.io/warn: restricted
---
apiVersion: v1
kind: SecurityContextConstraints
metadata:
name: spark-scc
allowHostDirVolumePlugin: false
allowHostIPC: false
allowHostNetwork: false
allowHostPID: false
allowPrivilegedContainer: false
allowedCapabilities: []
defaultAddCapabilities: []
requiredDropCapabilities:
- ALL
fsGroup:
type: RunAsAny
runAsUser:
type: MustRunAsNonRoot
seLinuxContext:
type: MustRunAs
supplementalGroups:
type: RunAsAny
volumes:
- configMap
- downwardAPI
- emptyDir
- persistentVolumeClaim
- projected
- secret
kubectl apply -f /tmp/pod-security-policy.yaml
Verify your setup
Check that all components are running correctly and scaling policies are active.
# Verify cluster autoscaler
kubectl -n kube-system get pods -l app=cluster-autoscaler
kubectl -n kube-system logs -l app=cluster-autoscaler --tail=50
Check Spark Operator
kubectl -n spark-system get pods
kubectl -n spark-system logs -l app.kubernetes.io/name=spark-operator
Verify Spark application
kubectl -n spark-jobs get sparkapplications
kubectl -n spark-jobs describe sparkapplication spark-pi
Check HPA status
kubectl -n spark-jobs get hpa
kubectl top nodes
kubectl top pods -n spark-jobs
Verify monitoring
kubectl -n monitoring get pods
curl -k http://localhost:3000
Common issues
| Symptom | Cause | Fix |
|---|---|---|
| Spark applications stuck in pending | Insufficient cluster resources | Check node capacity: kubectl describe nodes |
| Executors not scaling dynamically | Dynamic allocation disabled | Verify config: kubectl -n spark-jobs get configmap spark-config -o yaml |
| Cluster autoscaler not adding nodes | Missing node group tags | Check AWS ASG tags include cluster-autoscaler annotations |
| Metrics not appearing in Grafana | ServiceMonitor misconfigured | Check targets in Prometheus UI: kubectl port-forward -n monitoring svc/prometheus-operated 9090 |
| Pod disruption during scaling | PodDisruptionBudget too restrictive | Review PDB settings: kubectl -n spark-jobs get pdb |
Next steps
- Set up Spark Streaming with Kafka and Delta Lake for real-time analytics to process streaming data
- Implement Spark SQL performance optimization with Catalyst optimizer for query tuning
- Implement Kubernetes secrets management with HashiCorp Vault integration for secure configuration
- Configure Prometheus monitoring for ArgoCD for GitOps workflow observability
- Setup Kubernetes Ingress NGINX with cert-manager for external access
Running this in production?
Automated install script
Run this to automate the entire setup
#!/usr/bin/env bash
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'
# Default configuration
CLUSTER_NAME="${1:-spark-cluster}"
AWS_REGION="${2:-us-west-2}"
NAMESPACE_SPARK="spark-system"
NAMESPACE_JOBS="spark-jobs"
NAMESPACE_MONITORING="monitoring"
usage() {
echo "Usage: $0 [cluster-name] [aws-region]"
echo " cluster-name: Kubernetes cluster name (default: spark-cluster)"
echo " aws-region: AWS region for autoscaler (default: us-west-2)"
exit 1
}
log() {
echo -e "${GREEN}[INFO]${NC} $1"
}
warn() {
echo -e "${YELLOW}[WARN]${NC} $1"
}
error() {
echo -e "${RED}[ERROR]${NC} $1"
exit 1
}
cleanup() {
warn "Installation failed. Cleaning up..."
kubectl delete namespace $NAMESPACE_SPARK $NAMESPACE_JOBS $NAMESPACE_MONITORING 2>/dev/null || true
helm uninstall spark-operator prometheus grafana 2>/dev/null || true
}
trap cleanup ERR
detect_distro() {
if [ -f /etc/os-release ]; then
. /etc/os-release
case "$ID" in
ubuntu|debian)
PKG_MGR="apt"
PKG_UPDATE="apt update && apt upgrade -y"
PKG_INSTALL="apt install -y"
;;
almalinux|rocky|centos|rhel|ol|fedora)
PKG_MGR="dnf"
PKG_UPDATE="dnf update -y"
PKG_INSTALL="dnf install -y"
;;
amzn)
PKG_MGR="yum"
PKG_UPDATE="yum update -y"
PKG_INSTALL="yum install -y"
;;
*)
error "Unsupported distro: $ID"
;;
esac
else
error "Cannot detect OS distribution"
fi
log "Detected OS: $ID using $PKG_MGR"
}
check_prerequisites() {
log "Checking prerequisites..."
if [[ $EUID -eq 0 ]]; then
error "This script should not be run as root. Use a user with sudo privileges."
fi
if ! command -v sudo &> /dev/null; then
error "sudo is required but not installed"
fi
if ! kubectl cluster-info &> /dev/null; then
error "kubectl not configured or cluster not accessible"
fi
local nodes=$(kubectl get nodes --no-headers | wc -l)
if [[ $nodes -lt 3 ]]; then
error "Cluster must have at least 3 worker nodes (found: $nodes)"
fi
}
update_system() {
log "[1/8] Updating system packages..."
sudo $PKG_UPDATE
sudo $PKG_INSTALL curl wget unzip
}
install_kubectl() {
log "[2/8] Installing kubectl..."
if command -v kubectl &> /dev/null; then
log "kubectl already installed"
return
fi
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
rm -f kubectl
kubectl version --client
}
install_helm() {
log "[3/8] Installing Helm..."
if command -v helm &> /dev/null; then
log "Helm already installed"
else
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
fi
helm repo add spark-operator https://googlecloudplatform.github.io/spark-on-k8s-operator
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
}
create_namespaces() {
log "[4/8] Creating Kubernetes namespaces..."
kubectl create namespace $NAMESPACE_SPARK --dry-run=client -o yaml | kubectl apply -f -
kubectl create namespace $NAMESPACE_JOBS --dry-run=client -o yaml | kubectl apply -f -
kubectl create namespace $NAMESPACE_MONITORING --dry-run=client -o yaml | kubectl apply -f -
}
deploy_cluster_autoscaler() {
log "[5/8] Deploying cluster autoscaler..."
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: ServiceAccount
metadata:
name: cluster-autoscaler
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: cluster-autoscaler
rules:
- apiGroups: [""]
resources: ["events", "endpoints"]
verbs: ["create", "patch"]
- apiGroups: [""]
resources: ["pods/eviction"]
verbs: ["create"]
- apiGroups: [""]
resources: ["pods/status"]
verbs: ["update"]
- apiGroups: [""]
resources: ["endpoints"]
resourceNames: ["cluster-autoscaler"]
verbs: ["get", "update"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["watch", "list", "get", "update"]
- apiGroups: [""]
resources: ["namespaces", "pods", "services", "replicationcontrollers", "persistentvolumeclaims", "persistentvolumes"]
verbs: ["watch", "list", "get"]
- apiGroups: ["extensions"]
resources: ["replicasets", "daemonsets"]
verbs: ["watch", "list", "get"]
- apiGroups: ["policy"]
resources: ["poddisruptionbudgets"]
verbs: ["watch", "list"]
- apiGroups: ["apps"]
resources: ["statefulsets", "replicasets", "daemonsets"]
verbs: ["watch", "list", "get"]
- apiGroups: ["storage.k8s.io"]
resources: ["storageclasses", "csinodes", "csidrivers", "csistoragecapacities"]
verbs: ["watch", "list", "get"]
- apiGroups: ["batch"]
resources: ["jobs", "cronjobs"]
verbs: ["watch", "list", "get"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["create"]
- apiGroups: ["coordination.k8s.io"]
resourceNames: ["cluster-autoscaler"]
resources: ["leases"]
verbs: ["get", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: cluster-autoscaler
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: cluster-autoscaler
subjects:
- kind: ServiceAccount
name: cluster-autoscaler
namespace: kube-system
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: cluster-autoscaler
namespace: kube-system
labels:
app: cluster-autoscaler
spec:
selector:
matchLabels:
app: cluster-autoscaler
replicas: 1
template:
metadata:
labels:
app: cluster-autoscaler
spec:
priorityClassName: system-cluster-critical
securityContext:
runAsNonRoot: true
runAsUser: 65534
fsGroup: 65534
serviceAccountName: cluster-autoscaler
containers:
- image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.28.0
name: cluster-autoscaler
resources:
limits:
cpu: 100m
memory: 600Mi
requests:
cpu: 100m
memory: 600Mi
command:
- ./cluster-autoscaler
- --v=4
- --stderrthreshold=info
- --cloud-provider=aws
- --skip-nodes-with-local-storage=false
- --expander=least-waste
- --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/$CLUSTER_NAME
- --balance-similar-node-groups
- --skip-nodes-with-system-pods=false
- --scale-down-enabled=true
- --scale-down-delay-after-add=10m
- --scale-down-unneeded-time=10m
env:
- name: AWS_REGION
value: $AWS_REGION
volumeMounts:
- name: ssl-certs
mountPath: /etc/ssl/certs/ca-certificates.crt
readOnly: true
imagePullPolicy: "Always"
volumes:
- name: ssl-certs
hostPath:
path: "/etc/ssl/certs/ca-bundle.crt"
nodeSelector:
kubernetes.io/os: linux
EOF
}
install_spark_operator() {
log "[6/8] Installing Spark Operator..."
helm upgrade --install spark-operator spark-operator/spark-operator \
--namespace $NAMESPACE_SPARK \
--set webhook.enable=true \
--set metrics.enable=true \
--set podMonitor.enable=true
}
install_monitoring() {
log "[7/8] Installing monitoring stack (Prometheus & Grafana)..."
helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \
--namespace $NAMESPACE_MONITORING \
--set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \
--set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false
helm upgrade --install grafana grafana/grafana \
--namespace $NAMESPACE_MONITORING \
--set adminPassword=admin123 \
--set service.type=ClusterIP
}
verify_installation() {
log "[8/8] Verifying installation..."
log "Checking cluster autoscaler..."
kubectl wait --for=condition=available --timeout=300s deployment/cluster-autoscaler -n kube-system
log "Checking Spark operator..."
kubectl wait --for=condition=available --timeout=300s deployment/spark-operator -n $NAMESPACE_SPARK
log "Checking monitoring components..."
kubectl wait --for=condition=available --timeout=300s deployment/prometheus-kube-prometheus-prometheus-operator -n $NAMESPACE_MONITORING
log "Installation completed successfully!"
log "Namespaces created: $NAMESPACE_SPARK, $NAMESPACE_JOBS, $NAMESPACE_MONITORING"
log "To access Grafana: kubectl port-forward svc/grafana 3000:80 -n $NAMESPACE_MONITORING"
log "Grafana credentials: admin/admin123"
}
main() {
if [[ "${1:-}" == "-h" ]] || [[ "${1:-}" == "--help" ]]; then
usage
fi
detect_distro
check_prerequisites
update_system
install_kubectl
install_helm
create_namespaces
deploy_cluster_autoscaler
install_spark_operator
install_monitoring
verify_installation
}
main "$@"
Review the script before running. Execute with: bash install.sh