Observability Stack - Quick Reference Guide
Fast Command Reference
Copy-paste-ready commands for common observability tasks.

Prometheus
Connect to Prometheus
Copy
kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090
# Navigate to: http://localhost:9090
Check Targets
Copy
kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090 &
sleep 2  # give the backgrounded port-forward a moment to establish before curling
# Summarize active targets per scrape job, busiest first
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets | map(.labels.job) | group_by(.) | map({job: .[0], count: length}) | sort_by(-.count)'
# Stop the background port-forward
kill %1 2>/dev/null
Reload Config
Copy
# NOTE(review): the reload endpoint only works when Prometheus runs with
# --web.enable-lifecycle; otherwise this POST returns an error — confirm the
# flag is set on this deployment before relying on it
kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090 &
curl -X POST http://localhost:9090/-/reload
# Stop the background port-forward started above
kill %1 2>/dev/null
Query Metric
Copy
kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090 &
# Example: Get cluster health
curl -s 'http://localhost:9090/api/v1/query?query=elasticsearch_cluster_health_status' | jq '.data.result'
kill %1 2>/dev/null
Check Scrape Health
Copy
kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090 &
sleep 2  # wait for the port-forward to establish
# Healthy targets (URLs quoted: '?' is a shell glob character)
curl -s 'http://localhost:9090/api/v1/targets?state=up' | jq '.data.activeTargets | length'
# Unhealthy targets — failing targets are still listed under .activeTargets
# when state=down; .droppedTargets only contains targets removed by
# relabeling, so counting it here always reported the wrong number
curl -s 'http://localhost:9090/api/v1/targets?state=down' | jq '.data.activeTargets | length'
kill %1 2>/dev/null
Alertmanager
Connect to Alertmanager
Copy
kubectl port-forward -n observability svc/prometheus-kube-prom-alertmanager 9093:9093
# Navigate to: http://localhost:9093
View Active Alerts
Copy
kubectl port-forward -n observability svc/prometheus-kube-prom-alertmanager 9093:9093 &
curl -s http://localhost:9093/api/v1/alerts | jq '.data[] | {name: .labels.alertname, severity: .labels.severity, state: .state}'
kill %1 2>/dev/null
Silence Alert (10 hours)
Copy
kubectl port-forward -n observability svc/prometheus-kube-prom-alertmanager 9093:9093 &
# NOTE(review): 'date -u -d "+10 hours"' is GNU date syntax; on macOS/BSD use
# 'date -u -v+10H ...' instead. The quoting below deliberately closes the
# single-quoted JSON, splices in the $(date ...) substitution, then reopens it.
curl -X POST http://localhost:9093/api/v1/silences \
-H "Content-Type: application/json" \
-d '{
"matchers": [
{"name": "alertname", "value": "YOUR_ALERT_NAME", "isRegex": false}
],
"startsAt": "'$(date -u +'%Y-%m-%dT%H:%M:%S.000Z')'",
"endsAt": "'$(date -u -d "+10 hours" +'%Y-%m-%dT%H:%M:%S.000Z')'",
"createdBy": "oncall",
"comment": "Maintenance"
}'
# Stop the background port-forward
kill %1 2>/dev/null
Remove Silence
Copy
kubectl port-forward -n observability svc/prometheus-kube-prom-alertmanager 9093:9093 &
# Get silence ID first
curl -s http://localhost:9093/api/v1/silences | jq '.data[].id'
# Delete silence
curl -X DELETE http://localhost:9093/api/v1/silences/SILENCE_ID
kill %1 2>/dev/null
Check Configuration
Copy
kubectl get alertmanagerconfig -n observability -o yaml
Test Alert Routing
Copy
# Inject a test alert directly into Alertmanager. (Prometheus's
# /api/v1/alerts endpoint is read-only — POSTing a test alert to Prometheus
# on 9090, as this snippet previously did, does not create an alert.)
kubectl port-forward -n observability svc/prometheus-kube-prom-alertmanager 9093:9093 &
sleep 2  # wait for the port-forward to establish
curl -X POST http://localhost:9093/api/v1/alerts \
-H "Content-Type: application/json" \
-d '[{
"labels": {
"alertname": "TestAlert",
"severity": "critical"
},
"annotations": {
"summary": "Test alert routing"
}
}]'
# Wait 10s, then confirm the alert shows up in Alertmanager
sleep 10
curl -s http://localhost:9093/api/v1/alerts | jq '.data[] | select(.labels.alertname=="TestAlert")'
kill %1 2>/dev/null
Grafana
Connect to Grafana
Copy
kubectl port-forward -n observability svc/prometheus-kube-prom-grafana 3000:80
# Navigate to: http://localhost:3000
# Default user: admin
# Password: kubectl get secret -n observability prometheus-kube-prom-grafana -o jsonpath='{.data.admin-password}' | base64 -d
Get Admin Password
Copy
kubectl get secret -n observability prometheus-kube-prom-grafana -o jsonpath='{.data.admin-password}' | base64 -d
List Dashboards
Copy
GRAFANA_POD=$(kubectl get pod -n observability -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')
kubectl port-forward -n observability "pod/$GRAFANA_POD" 3000:3000 &
sleep 2  # wait for the port-forward to establish
ADMIN_PASS=$(kubectl get secret -n observability prometheus-kube-prom-grafana -o jsonpath='{.data.admin-password}' | base64 -d)
# Quote the credentials ($ADMIN_PASS may contain shell metacharacters) and
# the URL ('?' is a shell glob character)
curl -s -u "admin:$ADMIN_PASS" \
'http://localhost:3000/api/search?type=dash-db' | jq '.[] | {title: .title, uid: .uid}'
kill %1 2>/dev/null
Restart Grafana
Copy
kubectl rollout restart deployment prometheus-kube-prom-grafana -n observability
kubectl rollout status deployment prometheus-kube-prom-grafana -n observability
Elasticsearch & Logging
Connect to Elasticsearch
Copy
# Background the port-forward — run in the foreground it blocks, and the
# commands below would never execute in the same shell
kubectl port-forward -n logging svc/elasticsearch 9200:9200 &
sleep 2  # wait for the port-forward to establish
# Get password (quoted when used: it may contain shell metacharacters)
ES_PASSWORD=$(kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d)
# Test connection (-k: ECK's default certificate is self-signed)
curl -s -k -u "elastic:$ES_PASSWORD" https://localhost:9200/_cluster/health | jq .
# Stop the background port-forward when done
kill %1 2>/dev/null
Check Cluster Health
Copy
ES_PASSWORD=$(kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d)
# Quote the credentials ($ES_PASSWORD may contain shell metacharacters) and
# the URL ('?' is a shell glob character)
kubectl exec -n logging elasticsearch-es-default-0 -- \
curl -s -k -u "elastic:$ES_PASSWORD" 'https://localhost:9200/_cluster/health?pretty'
List Indices
Copy
ES_PASSWORD=$(kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d)
kubectl exec -n logging elasticsearch-es-default-0 -- \
curl -s -k -u elastic:$ES_PASSWORD 'https://localhost:9200/_cat/indices?v'
Count Documents in Index
Copy
ES_PASSWORD=$(kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d)
kubectl exec -n logging elasticsearch-es-default-0 -- \
curl -s -k -u elastic:$ES_PASSWORD 'https://localhost:9200/app-*/_count' | jq .count
Delete Index
Copy
ES_PASSWORD=$(kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d)
# DESTRUCTIVE: permanently deletes the index and all its documents —
# double-check the index name first. Credentials quoted against shell
# metacharacters in the password.
kubectl exec -n logging elasticsearch-es-default-0 -- \
curl -s -k -u "elastic:$ES_PASSWORD" -X DELETE 'https://localhost:9200/debug-2026.02.01'
Check Fluentd Status
Copy
# Pods
kubectl get pods -n logging -l app.kubernetes.io/name=fluentd
# Logs
kubectl logs -n logging -l app.kubernetes.io/name=fluentd --tail=50 -f
Connect to Kibana
Copy
kubectl port-forward -n logging svc/kibana-kb-http 5601:5601
# Navigate to: http://localhost:5601
# Username: elastic
# Password: kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d
Elasticsearch Exporter
Check Exporter Metrics
Copy
kubectl port-forward -n logging <exporter-pod-name> 9114:9114 &
curl -s http://localhost:9114/metrics | grep elasticsearch | head -20
kill %1 2>/dev/null
Check ServiceMonitor
Copy
kubectl get servicemonitor -n logging es-exporter -o yaml
Verify in Prometheus
Copy
kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090 &
sleep 2  # wait for the port-forward to establish
# The targets API has no 'match[]' parameter (that belongs to the query/series
# APIs) — filter the target list with jq on the job label instead
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | select(.labels.job=="elasticsearch-exporter")'
kill %1 2>/dev/null
ExternalSecrets
Check ExternalSecret Status
Copy
# List all
kubectl get externalsecret -n observability
# Check specific
kubectl describe externalsecret alertmanager-pagerduty-secret -n observability
# Expected status: SecretSynced
Verify Secret Created
Copy
# Check secret exists
kubectl get secret alertmanager-pagerduty-secret -n observability
# View secret metadata (don't print values!)
kubectl describe secret alertmanager-pagerduty-secret -n observability
Troubleshoot Sync Error
Copy
# Check ClusterSecretStore
kubectl describe clustersecretstore aws-secrets
# Check ExternalSecret events
kubectl describe externalsecret alertmanager-pagerduty-secret -n observability
# Check if secret exists in AWS
aws secretsmanager describe-secret --secret-id /sparki/prod/pagerduty/routing-key
Kubernetes Resources
Pod Status
Copy
# All observability pods
kubectl get pods -n observability -o wide
# All logging pods
kubectl get pods -n logging -o wide
# Watch pod status
kubectl get pods -n observability -w
Resource Usage
Copy
# Top pods by memory
kubectl top pods -n observability --sort-by=memory
# Top pods by CPU
kubectl top pods -n observability --sort-by=cpu
Pod Logs
Copy
# Prometheus
kubectl logs -n observability -l app.kubernetes.io/name=prometheus --tail=50
# Alertmanager
kubectl logs -n observability -l app.kubernetes.io/name=alertmanager --tail=50
# Grafana
kubectl logs -n observability -l app.kubernetes.io/name=grafana --tail=50
# Follow logs
kubectl logs -n observability -l app=prometheus -f
Restart Components
Copy
# Restart Prometheus
kubectl rollout restart statefulset prometheus-kube-prom-prometheus -n observability
# Restart Alertmanager
kubectl rollout restart statefulset prometheus-kube-prom-alertmanager -n observability
# Restart Grafana
kubectl rollout restart deployment prometheus-kube-prom-grafana -n observability
# Restart Fluentd
kubectl rollout restart daemonset fluentd -n logging
Health Checks
Quick Health Check
Copy
#!/bin/bash
# Quick pod-level health check across the observability and logging stacks:
# prints each component's pod phase plus a one-line overall verdict.
echo "=== Quick Observability Health Check ==="

# Phase of the first pod matching a label selector; "NotFound" when no pod
# matches (the bare jsonpath index would otherwise error out)
pod_phase() {
  local ns=$1 selector=$2
  kubectl get pod -n "$ns" -l "$selector" -o jsonpath='{.items[0].status.phase}' 2>/dev/null || echo "NotFound"
}

PROM=$(pod_phase observability app.kubernetes.io/name=prometheus)
echo "Prometheus: $PROM"
ALERT=$(pod_phase observability app.kubernetes.io/name=alertmanager)
echo "Alertmanager: $ALERT"
GRAF=$(pod_phase observability app.kubernetes.io/name=grafana)
echo "Grafana: $GRAF"
ES=$(pod_phase logging elasticsearch.k8s.elastic.co/cluster-name=elasticsearch)
echo "Elasticsearch: $ES"
# Fluentd is a DaemonSet — report ready/desired rather than a single pod phase
FLUENT=$(kubectl get daemonset -n logging fluentd -o jsonpath='{.status.numberReady}/{.status.desiredNumberScheduled}' 2>/dev/null)
echo "Fluentd: $FLUENT"
EXT=$(kubectl get externalsecret -n observability alertmanager-pagerduty-secret -o jsonpath='{.status.lastSyncTime}' 2>/dev/null)
echo "ExternalSecrets last sync: $EXT"

# Explicit if/else instead of the 'a && b || c' chain, which runs the failure
# branch whenever ANY command in the success branch fails
if [[ "$PROM" == "Running" && "$ALERT" == "Running" && "$GRAF" == "Running" && "$ES" == "Running" ]]; then
  echo "✓ All healthy"
else
  echo "✗ Check failed components"
fi
Deep Health Check
Copy
# Run full verification script
bash ./infra/kubernetes-manifests/base/observability/scripts/verify-deployment.sh
Debugging
Check Event Logs
Copy
# Get recent events
kubectl get events -n observability --sort-by='.lastTimestamp' | tail -20
# Watch events
kubectl get events -n observability -w
Describe Pod
Copy
# Full pod details including events
kubectl describe pod -n observability <pod-name>
View Pod Logs with Timestamps
Copy
# Prometheus logs with timestamps
kubectl logs -n observability -l app.kubernetes.io/name=prometheus --timestamps=true | tail -20
Execute Commands in Pod
Copy
# Test connectivity from Alertmanager
kubectl exec -n observability prometheus-kube-prom-alertmanager-0 -- \
curl -s https://api.pagerduty.com/ping
# Test connectivity from Fluentd
kubectl exec -n logging <fluentd-pod> -- \
curl -k https://elasticsearch.logging.svc.cluster.local:9200/_cluster/health
Emergency Commands
Force Restart Pod
Copy
# Delete pod (it will be recreated)
kubectl delete pod -n observability <pod-name>
Increase Resource Limits
Copy
# Edit Prometheus resource limits ('kubectl edit' requires the resource name;
# list CRs with: kubectl get prometheus -n observability)
kubectl edit prometheus prometheus-kube-prom-prometheus -n observability
# Modify: spec.resources.limits
# (spec.prometheusSpec.* is the Helm values path, not the Prometheus CR path)
Reduce Data Retention
Copy
# Quick reduce to save space
# 'kubectl patch' requires the resource name — confirm it with:
# kubectl get prometheus -n observability
kubectl patch prometheus prometheus-kube-prom-prometheus -n observability \
--type merge \
-p '{"spec":{"retention":"7d"}}'
Disable Alert Routing (Emergency)
Copy
# Add "halt" receiver (discards all alerts). 'kubectl edit' requires the
# resource name — list configs with: kubectl get alertmanagerconfig -n observability
kubectl edit alertmanagerconfig <config-name> -n observability
# Under routes, add (note group_by must be indented under the route entry):
# - receiver: 'halt'
#   group_by: ['alertname']
Tips
- Always port-forward rather than expose services externally for security
- Use jq for JSON parsing - it makes Prometheus API queries readable
- Save commands as shell functions in ~/.bashrc for repeated use
- Use kubectl aliases like k=kubectl to save typing
- Test in staging first before running any commands in production