
Quick Reference: Deployment Commands

Environment Variables (Set Before Running)

# Shared variables referenced by the snippets below — export them in your
# shell session before running any other command in this guide.
export AWS_REGION="us-east-1"
export PROJECT_NAME="sparki"
export ENVIRONMENT="prod"  # or "staging", "dev"
export CLUSTER_NAME="sparki-prod"
export NAMESPACE="sparki-engine"
export NEW_VERSION="v1.2.3"  # image tag to deploy (also used in migration job names)

1. Initial Infrastructure Deployment

# Navigate to terraform directory
cd infrastructure/terraform

# Initialize Terraform (downloads providers/modules; safe to re-run)
terraform init

# Format and validate
terraform fmt -recursive
terraform validate

# Plan changes (preview)
terraform plan \
  -var-file="environments/${ENVIRONMENT}.tfvars" \
  -out=tfplan

# Review tfplan output carefully!

# Apply infrastructure
# Applying the saved plan file guarantees exactly the reviewed changes run.
terraform apply tfplan

# Capture outputs
terraform output -json > outputs.json

2. Deploy Application (Simple)

# Execute deployment script
./infrastructure/scripts/deploy.sh ${ENVIRONMENT} ${NEW_VERSION}

# Wait for rollout
# rollout status exits non-zero if the rollout does not finish within --timeout.
kubectl rollout status deployment/sparki-engine -n ${NAMESPACE} --timeout=600s
kubectl rollout status deployment/sparki-web -n ${NAMESPACE} --timeout=600s

# Run health checks
./infrastructure/scripts/health-check.sh ${ENVIRONMENT}

3. Deploy Application (Blue-Green)

# Execute blue-green deployment
./infrastructure/scripts/deploy-blue-green.sh ${ENVIRONMENT} ${NEW_VERSION}

# Monitor in Grafana
# Dashboard: Command Center
# Watch: Error Rate, Latency, Pod Status

# Wait 2 hours for stability window

# Success! Check:
kubectl get pods -n ${NAMESPACE} -o wide
# Hits /health through the LB hostname; the jsonpath is empty until the
# load balancer has provisioned — re-run once it has an ingress hostname.
curl http://$(kubectl get svc sparki-engine-lb -n ${NAMESPACE} -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')/health

4. Rollback (Emergency)

# Immediate rollback
./infrastructure/scripts/rollback.sh ${ENVIRONMENT}

# Verify rollback
# NOTE(review): checks the -blue deployment specifically — assumes the
# rollback script always shifts traffic back to blue; confirm in rollback.sh.
kubectl rollout status deployment/sparki-engine-blue -n ${NAMESPACE}
./infrastructure/scripts/health-check.sh ${ENVIRONMENT}

# Check logs
kubectl logs deployment/sparki-engine-blue -n ${NAMESPACE} --tail=50

5. Database Operations

View Database Info

# Get RDS endpoint
aws rds describe-db-instances \
  --db-instance-identifier ${PROJECT_NAME}-${ENVIRONMENT} \
  --query 'DBInstances[0].[Endpoint.Address,DBInstanceStatus]'

# Connect to database (from pod)
# NOTE(review): DB_HOST is not exported in the env-var section at the top —
# set it to the RDS endpoint printed by the command above before running psql.
POD=$(kubectl get pods -n ${NAMESPACE} -l app=sparki-engine -o name | head -1)
kubectl exec -it $POD -n ${NAMESPACE} -- \
  psql -h $DB_HOST -U sparki_admin -d sparki

Run Migrations

# Create migration job
# Runs the engine image's "./migrate up" as a one-shot Job; DATABASE_URL
# comes from the db-credentials secret. restartPolicy: Never leaves failed
# migration pods in place for inspection instead of retrying them.
kubectl apply -f - <<EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: db-migration-${NEW_VERSION}
  namespace: ${NAMESPACE}
spec:
  template:
    spec:
      containers:
      - name: migrate
        image: ghcr.io/alexarno/sparki/engine:${NEW_VERSION}
        command: ["./migrate", "up"]
        env:
        - name: DATABASE_URL
          valueFrom:
            secretKeyRef:
              name: db-credentials
              key: url
      restartPolicy: Never
EOF

# Watch migration
# NOTE(review): 'logs -f' fails if the Job's pod has not started yet —
# re-run after a moment, or wait with 'kubectl wait --for=condition=complete'.
kubectl logs -f job/db-migration-${NEW_VERSION} -n ${NAMESPACE}

Backup Database

# Create snapshot
# Timestamp suffix keeps repeated snapshot identifiers unique.
aws rds create-db-snapshot \
  --db-instance-identifier ${PROJECT_NAME}-${ENVIRONMENT} \
  --db-snapshot-identifier ${PROJECT_NAME}-${ENVIRONMENT}-$(date +%Y%m%d-%H%M%S)

# List snapshots
aws rds describe-db-snapshots \
  --db-instance-identifier ${PROJECT_NAME}-${ENVIRONMENT} \
  --query 'DBSnapshots[*].[DBSnapshotIdentifier,CreateTime]'

Restore from Backup

# Create new instance from snapshot
# NOTE(review): SNAPSHOT_ID must be set manually from the "List snapshots"
# output above — it is not exported anywhere in this guide.
aws rds restore-db-instance-from-db-snapshot \
  --db-instance-identifier ${PROJECT_NAME}-${ENVIRONMENT}-restored \
  --db-snapshot-identifier ${SNAPSHOT_ID}

# Verify restore completed
aws rds describe-db-instances \
  --db-instance-identifier ${PROJECT_NAME}-${ENVIRONMENT}-restored \
  --query 'DBInstances[0].DBInstanceStatus'

6. Cache Operations

Check Redis Health

# Get Redis endpoint
# (run from infrastructure/terraform, where the state lives)
REDIS_ENDPOINT=$(terraform output -raw redis_endpoint)

# Connect from pod
POD=$(kubectl get pods -n ${NAMESPACE} -o name | head -1)
kubectl exec $POD -n ${NAMESPACE} -- \
  redis-cli -h ${REDIS_ENDPOINT} ping

# Check memory usage
kubectl exec $POD -n ${NAMESPACE} -- \
  redis-cli -h ${REDIS_ENDPOINT} info memory

Flush Cache

# WARNING: Clears all cache data

# (run terraform output from infrastructure/terraform)
REDIS_ENDPOINT=$(terraform output -raw redis_endpoint)
POD=$(kubectl get pods -n ${NAMESPACE} -o name | head -1)

kubectl exec $POD -n ${NAMESPACE} -- \
  redis-cli -h ${REDIS_ENDPOINT} FLUSHALL

# Restart pods to refill cache
# Deleting the pods relies on the Deployment recreating them automatically.
kubectl delete pods -n ${NAMESPACE} -l app=sparki-engine

7. Monitoring & Logging

View Logs

# Last 50 lines
kubectl logs deployment/sparki-engine -n ${NAMESPACE} --tail=50

# Follow logs in real-time
kubectl logs -f deployment/sparki-engine -n ${NAMESPACE}

# All containers
kubectl logs deployment/sparki-engine -n ${NAMESPACE} --all-containers=true

# Logs from specific pod (replace with a real name from 'kubectl get pods')
kubectl logs pod/sparki-engine-abc123 -n ${NAMESPACE}

# Export logs for analysis
kubectl logs deployment/sparki-engine -n ${NAMESPACE} --since=1h > logs-last-hour.txt

View Metrics (via kubectl)

# Pod resource usage (requires metrics-server in the cluster)
kubectl top pods -n ${NAMESPACE}

# Node resource usage
kubectl top nodes

# Detailed pod metrics
kubectl describe pod $(kubectl get pods -n ${NAMESPACE} -o name | head -1) -n ${NAMESPACE}

View Metrics (via Grafana)

Dashboard: Command Center
  → Error Rate
  → Request Latency (P50/P95/P99)
  → Pod Health
  → CPU/Memory Usage

Dashboard: Reliability SLO
  → Error Budget Remaining
  → Burn Rate (30-min and 24-hour)
  → SLO Compliance

Search Traces (Jaeger)

Service: sparki-engine
Operation: (Select operation)
Tags:
  error: true  (to find errors only)
Limit: 20
Min Duration: 1000ms (find slow requests)

8. Troubleshooting Commands

Check Cluster Status

# Nodes
kubectl get nodes -o wide
kubectl describe nodes | grep -A 5 "Allocatable"

# Pods
kubectl get pods -n ${NAMESPACE} -o wide
kubectl get pods -n ${NAMESPACE} --field-selector=status.phase=Failed

# Events (ascending by time — newest last).
# 'kubectl get events' accepts --sort-by; the newer 'kubectl events'
# subcommand does not take that flag.
kubectl get events -n ${NAMESPACE} --sort-by='.lastTimestamp'

# Services
kubectl get svc -n ${NAMESPACE}
kubectl describe svc sparki-engine-lb -n ${NAMESPACE}

Debug Pod

# Exec into pod (interactive shell)
kubectl exec -it pod/sparki-engine-abc123 -n ${NAMESPACE} -- /bin/bash

# Run commands
kubectl exec pod/sparki-engine-abc123 -n ${NAMESPACE} -- \
  curl -v http://localhost:8080/health

# Copy files from pod — kubectl cp takes <namespace>/<pod-name>:<path>
# (no "pod/" resource prefix, unlike exec/logs)
kubectl cp ${NAMESPACE}/sparki-engine-abc123:/var/log/app.log ./app.log

Check Network Connectivity

# From pod to external service
POD=$(kubectl get pods -n ${NAMESPACE} -o name | head -1)

# Ping
# NOTE(review): ping/nslookup/nc may be absent from minimal container images.
kubectl exec $POD -n ${NAMESPACE} -- ping 8.8.8.8

# DNS resolution
kubectl exec $POD -n ${NAMESPACE} -- nslookup example.com

# Port connectivity (DB_HOST must be set to the RDS endpoint first)
kubectl exec $POD -n ${NAMESPACE} -- nc -zv ${DB_HOST} 5432

Check Secrets

# List secrets
kubectl get secrets -n ${NAMESPACE}

# View secret (base64 decoded)
kubectl get secret db-credentials -n ${NAMESPACE} \
  -o jsonpath='{.data.password}' | base64 -d

# Create secret
kubectl create secret generic db-credentials \
  -n ${NAMESPACE} \
  --from-literal=username=sparki_admin \
  --from-literal=password=MY_SECRET_PASSWORD

9. Scaling Operations

Scale Deployment

# Increase replicas
# NOTE(review): if an HPA targets this deployment it may override a manual
# scale — confirm before relying on the new replica count.
kubectl scale deployment sparki-engine \
  -n ${NAMESPACE} \
  --replicas=5

# Wait for scaling
kubectl rollout status deployment/sparki-engine -n ${NAMESPACE}

Scale Node Group (AWS)

# Update desired size
aws eks update-nodegroup-config \
  --cluster-name ${CLUSTER_NAME} \
  --nodegroup-name sparki-workers \
  --scaling-config minSize=1,maxSize=10,desiredSize=5

# Monitor scaling — Auto Scaling groups live under the 'autoscaling'
# service in the AWS CLI, not 'ec2'
aws autoscaling describe-auto-scaling-groups \
  --query 'AutoScalingGroups[*].[AutoScalingGroupName,DesiredCapacity,Instances[].InstanceId]'

10. Infrastructure Validation

Validate Terraform

# Format check
terraform fmt -check -recursive

# Syntax validation
terraform validate

# Linting (style/correctness). NOTE(review): tflint is a linter, not a
# security scanner — use tfsec or checkov for actual security scanning.
tflint

# Rough change summary (not a real cost estimate — use infracost for cost)
terraform plan -var-file="environments/prod.tfvars" | grep -A 100 "Plan:"

Validate Deployment

# Manifest validation (client-side schema check; nothing is applied)
kubectl apply -f deployment.yaml --dry-run=client

# Policy validation
kubectl get pods -n ${NAMESPACE} -o jsonpath='{.items[*].spec.securityContext}'

11. Emergency Procedures

Service Restart

# Restart all pods (causes brief downtime)
# NOTE(review): 'rollout restart' performs a rolling replacement, so downtime
# should be minimal with multiple replicas — confirm against replica count.
kubectl rollout restart deployment/sparki-engine -n ${NAMESPACE}
kubectl rollout restart deployment/sparki-web -n ${NAMESPACE}

# Wait for restart
kubectl rollout status deployment/sparki-engine -n ${NAMESPACE}

Scale to Zero (Pause Service)

# Stop processing
kubectl scale deployment sparki-engine -n ${NAMESPACE} --replicas=0

# Resume (3 is the assumed baseline — adjust per environment)
kubectl scale deployment sparki-engine -n ${NAMESPACE} --replicas=3

Force Delete Stuck Pod

# Only if pod is stuck in Unknown state
# (replace 'pod-name' with the real pod; --force skips graceful shutdown)
kubectl delete pod pod-name -n ${NAMESPACE} --grace-period=0 --force

Delete and Recreate Deployment

# CAUTION: Causes downtime

# Backup current
# NOTE(review): the exported YAML includes live cluster fields (status,
# resourceVersion, …); kubectl apply tolerates them, but review backup.yaml.
kubectl get deployment sparki-engine -n ${NAMESPACE} -o yaml > backup.yaml

# Delete
kubectl delete deployment sparki-engine -n ${NAMESPACE}

# Recreate
kubectl apply -f backup.yaml

12. Useful Aliases

Add to .bashrc or .zshrc:
# Kubernetes aliases
alias k='kubectl'
alias kg='kubectl get'
alias kd='kubectl delete'
alias kl='kubectl logs'
alias kex='kubectl exec -it'
alias kdesc='kubectl describe'
alias kgp='kubectl get pods'
alias kgd='kubectl get deployments'
alias kgs='kubectl get services'

# Sparki-specific (namespace is hardcoded to sparki-engine here;
# the health/rollback/deploy aliases still take args, e.g. 'sparki-health prod')
alias sparki-logs='kubectl logs -f deployment/sparki-engine -n sparki-engine'
alias sparki-pods='kubectl get pods -n sparki-engine -o wide'
alias sparki-health='./infrastructure/scripts/health-check.sh'
alias sparki-rollback='./infrastructure/scripts/rollback.sh'
alias sparki-deploy='./infrastructure/scripts/deploy.sh'

13. Common Workflows

Weekly Status Check

#!/bin/bash
# Weekly cluster status report: nodes, pods, recent events, resource usage.

echo "=== Cluster Status ==="
kubectl get nodes -o wide

echo "=== Pod Health ==="
kubectl get pods -n sparki-engine -o wide

echo "=== Error Rate ==="
# Check Grafana Command Center dashboard

echo "=== Recent Events ==="
# 'kubectl get events' supports --sort-by (the 'kubectl events' subcommand
# does not). Sorted ascending by time, so take the TAIL for the most recent.
kubectl get events -n sparki-engine --sort-by='.lastTimestamp' | tail -10

echo "=== Node Resource Usage ==="
# 'kubectl top nodes' reports CPU/memory, not disk space.
kubectl top nodes

Pre-Deployment Checklist

#!/bin/bash
set -e

# Pre-deployment checklist.
# Fix: the original printed "✓ ..." unconditionally BEFORE each check ran,
# so a failing check still showed a checkmark. Each check now reports its
# actual result; the if/else keeps 'set -e' from aborting on a failed probe.

# REDIS_ENDPOINT must be set (e.g. terraform output -raw redis_endpoint).
: "${REDIS_ENDPOINT:?REDIS_ENDPOINT must be set before running this checklist}"

# API health endpoint (assumed port-forwarded to localhost:8080).
if curl -sf http://localhost:8080/api/health >/dev/null; then
  echo "✓ API/database accessible"
else
  echo "✗ API/database check FAILED"
fi

if redis-cli -h "${REDIS_ENDPOINT}" ping >/dev/null; then
  echo "✓ Cache accessible"
else
  echo "✗ Cache check FAILED"
fi

echo "Current version:"
kubectl get deployment sparki-engine -n sparki-engine \
  -o jsonpath='{.spec.template.spec.containers[0].image}'
echo

echo "Pod replicas:"
kubectl get deployment sparki-engine -n sparki-engine \
  -o jsonpath='{.spec.replicas}'
echo

Post-Deployment Validation

#!/bin/bash
set -e

echo "Waiting for rollout..."
kubectl rollout status deployment/sparki-engine -n sparki-engine --timeout=600s

echo "Running health checks..."
./infrastructure/scripts/health-check.sh prod

echo "Checking error rate..."
# Let post-deploy traffic accumulate briefly before sampling metrics.
sleep 30
# Query Prometheus for error_rate
# NOTE(review): assumes 'prometheus:9090' resolves from wherever this runs
# (in-cluster or via port-forward) — confirm before relying on it.
curl -s 'http://prometheus:9090/api/v1/query?query=error_rate' | jq '.data.result'

Quick Fixes

| Issue | Quick Fix |
| --- | --- |
| Pod CrashLoop | `kubectl logs <pod> -n sparki-engine` → check logs for errors |
| High latency | `kubectl top pods -n sparki-engine` → check CPU/memory |
| DB connection errors | `nc -zv $DB_HOST 5432` → verify connectivity |
| Image pull error | `kubectl describe pod <pod> -n sparki-engine` → check image tag |
| Out of memory | `kubectl scale deployment sparki-engine -n sparki-engine --replicas=<new-count>` |
| Pod pending | `kubectl describe pod <pod> -n sparki-engine` → check resources/affinity |

Emergency Contact

  • On-Call SRE: Page via PagerDuty
  • Platform Lead: @alexarno
  • Incident Commander: @incident-commander

Last Updated: December 2025
Version: 1.0