Skip to main content

Observability Stack - Quick Reference Guide

Fast Command Reference

Copy-paste ready commands for common observability tasks.

Prometheus

Connect to Prometheus

# Foreground port-forward (blocks the terminal); press Ctrl-C to stop.
kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090
# Navigate to: http://localhost:9090

Check Targets

kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090 &
# Summarise active targets per scrape job, busiest job first.
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets | map(.labels.job) | group_by(.) | map({job: .[0], count: length}) | sort_by(-.count)'
# Stop the background port-forward started above.
kill %1 2>/dev/null

Reload Config

kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090 &
# NOTE(review): /-/reload only works if Prometheus runs with
# --web.enable-lifecycle — confirm it is set in this deployment.
curl -X POST http://localhost:9090/-/reload
kill %1 2>/dev/null

Query Metric

kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090 &

# Example: Get cluster health (instant query; .data.result holds the samples)
curl -s 'http://localhost:9090/api/v1/query?query=elasticsearch_cluster_health_status' | jq '.data.result'

kill %1 2>/dev/null

Check Scrape Health

kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090 &

# Healthy targets. The targets API "state" parameter only accepts
# active/dropped/any (not up/down), so filter on each target's .health field.
curl -s 'http://localhost:9090/api/v1/targets?state=active' | jq '[.data.activeTargets[] | select(.health=="up")] | length'

# Unhealthy targets. Failing scrapes are still *active* targets with
# health "down"; droppedTargets are targets removed by relabeling, which
# is a different thing entirely.
curl -s 'http://localhost:9090/api/v1/targets?state=active' | jq '[.data.activeTargets[] | select(.health=="down")] | length'

kill %1 2>/dev/null

Alertmanager

Connect to Alertmanager

# Foreground port-forward (blocks the terminal); press Ctrl-C to stop.
kubectl port-forward -n observability svc/prometheus-kube-prom-alertmanager 9093:9093
# Navigate to: http://localhost:9093

View Active Alerts

kubectl port-forward -n observability svc/prometheus-kube-prom-alertmanager 9093:9093 &
# Use the v2 API (v1 was removed in Alertmanager 0.27). v2 returns a bare
# JSON array and nests the state under .status.state.
curl -s http://localhost:9093/api/v2/alerts | jq '.[] | {name: .labels.alertname, severity: .labels.severity, state: .status.state}'
kill %1 2>/dev/null

Silence Alert (10 hours)

kubectl port-forward -n observability svc/prometheus-kube-prom-alertmanager 9093:9093 &

# v2 API (v1 was removed in Alertmanager 0.27).
# NOTE(review): "date -d" is GNU-specific; on macOS/BSD use: date -u -v+10H ...
curl -X POST http://localhost:9093/api/v2/silences \
  -H "Content-Type: application/json" \
  -d '{
    "matchers": [
      {"name": "alertname", "value": "YOUR_ALERT_NAME", "isRegex": false, "isEqual": true}
    ],
    "startsAt": "'$(date -u +'%Y-%m-%dT%H:%M:%S.000Z')'",
    "endsAt": "'$(date -u -d "+10 hours" +'%Y-%m-%dT%H:%M:%S.000Z')'",
    "createdBy": "oncall",
    "comment": "Maintenance"
  }'

kill %1 2>/dev/null

Remove Silence

kubectl port-forward -n observability svc/prometheus-kube-prom-alertmanager 9093:9093 &

# Get silence ID first (v2 API returns a bare array)
curl -s http://localhost:9093/api/v2/silences | jq '.[].id'

# Delete silence — note the *singular* "silence" path segment in the v2 API
curl -X DELETE http://localhost:9093/api/v2/silence/SILENCE_ID

kill %1 2>/dev/null

Check Configuration

kubectl get alertmanagerconfig -n observability -o yaml

Test Alert Routing

# Test alerts must be injected into Alertmanager, not Prometheus:
# Prometheus' /api/v1/alerts endpoint is read-only (it lists firing alerts).
kubectl port-forward -n observability svc/prometheus-kube-prom-alertmanager 9093:9093 &

curl -X POST http://localhost:9093/api/v2/alerts \
  -H "Content-Type: application/json" \
  -d '[{
    "labels": {
      "alertname": "TestAlert",
      "severity": "critical"
    },
    "annotations": {
      "summary": "Test alert routing"
    }
  }]'

# Wait for the routing tree to process the alert, then confirm it arrived
sleep 10
curl -s http://localhost:9093/api/v2/alerts | jq '.[] | select(.labels.alertname=="TestAlert")'

kill %1 2>/dev/null

Grafana

Connect to Grafana

# Service port 80 maps to Grafana's container port; local port is 3000.
kubectl port-forward -n observability svc/prometheus-kube-prom-grafana 3000:80
# Navigate to: http://localhost:3000
# Default user: admin
# Password: kubectl get secret -n observability prometheus-kube-prom-grafana -o jsonpath='{.data.admin-password}' | base64 -d

Get Admin Password

kubectl get secret -n observability prometheus-kube-prom-grafana -o jsonpath='{.data.admin-password}' | base64 -d

List Dashboards

GRAFANA_POD=$(kubectl get pod -n observability -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')

# Quote the pod name and credentials so values with special characters
# survive shell expansion; quote the URL so "?" is not globbed.
kubectl port-forward -n observability "pod/$GRAFANA_POD" 3000:3000 &

ADMIN_PASS=$(kubectl get secret -n observability prometheus-kube-prom-grafana -o jsonpath='{.data.admin-password}' | base64 -d)

curl -s -u "admin:$ADMIN_PASS" \
  'http://localhost:3000/api/search?type=dash-db' | jq '.[] | {title: .title, uid: .uid}'

kill %1 2>/dev/null

Restart Grafana

# Trigger a rolling restart, then block until the rollout completes.
kubectl rollout restart deployment prometheus-kube-prom-grafana -n observability
kubectl rollout status deployment prometheus-kube-prom-grafana -n observability

Elasticsearch & Logging

Connect to Elasticsearch

# Background the port-forward — a foreground port-forward blocks the
# terminal, so the commands below would never run.
kubectl port-forward -n logging svc/elasticsearch 9200:9200 &

# Get password (quoted on use: it may contain shell-special characters)
ES_PASSWORD=$(kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d)

# Test connection (-k: the cluster uses a self-signed certificate)
curl -s -k -u "elastic:$ES_PASSWORD" https://localhost:9200/_cluster/health | jq .

kill %1 2>/dev/null

Check Cluster Health

ES_PASSWORD=$(kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d)

# Quote the credentials (password may contain shell-special characters)
# and the URL (so the local shell does not glob the "?").
kubectl exec -n logging elasticsearch-es-default-0 -- \
  curl -s -k -u "elastic:$ES_PASSWORD" 'https://localhost:9200/_cluster/health?pretty'

List Indices

ES_PASSWORD=$(kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d)

# Quote the credentials: the password may contain shell-special characters.
kubectl exec -n logging elasticsearch-es-default-0 -- \
  curl -s -k -u "elastic:$ES_PASSWORD" 'https://localhost:9200/_cat/indices?v'

Count Documents in Index

ES_PASSWORD=$(kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d)

# Quote the credentials: the password may contain shell-special characters.
kubectl exec -n logging elasticsearch-es-default-0 -- \
  curl -s -k -u "elastic:$ES_PASSWORD" 'https://localhost:9200/app-*/_count' | jq .count

Delete Index

ES_PASSWORD=$(kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d)

# Quote the credentials: the password may contain shell-special characters.
# Destructive — double-check the index name before running.
kubectl exec -n logging elasticsearch-es-default-0 -- \
  curl -s -k -u "elastic:$ES_PASSWORD" -X DELETE 'https://localhost:9200/debug-2026.02.01'

Check Fluentd Status

# Pods
kubectl get pods -n logging -l app.kubernetes.io/name=fluentd

# Logs
kubectl logs -n logging -l app.kubernetes.io/name=fluentd --tail=50 -f

Connect to Kibana

# Foreground port-forward (blocks the terminal); press Ctrl-C to stop.
kubectl port-forward -n logging svc/kibana-kb-http 5601:5601
# Navigate to: http://localhost:5601
# Username: elastic
# Password: kubectl get secret -n logging elasticsearch-es-elastic-user -o jsonpath='{.data.elastic}' | base64 -d

Elasticsearch Exporter

Check Exporter Metrics

# Replace <exporter-pod-name> with the actual elasticsearch-exporter pod name.
kubectl port-forward -n logging <exporter-pod-name> 9114:9114 &
curl -s http://localhost:9114/metrics | grep elasticsearch | head -20
kill %1 2>/dev/null

Check ServiceMonitor

kubectl get servicemonitor -n logging es-exporter -o yaml

Verify in Prometheus

kubectl port-forward -n observability svc/prometheus-kube-prom-prometheus 9090:9090 &

# The /api/v1/targets endpoint does not support a match[] parameter
# (that belongs to /series and /label endpoints); filter by job with jq.
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | select(.labels.job=="elasticsearch-exporter")'

kill %1 2>/dev/null

ExternalSecrets

Check ExternalSecret Status

# List all
kubectl get externalsecret -n observability

# Check specific
kubectl describe externalsecret alertmanager-pagerduty-secret -n observability

# Expected status: SecretSynced

Verify Secret Created

# Check secret exists
kubectl get secret alertmanager-pagerduty-secret -n observability

# View secret metadata (don't print values!)
kubectl describe secret alertmanager-pagerduty-secret -n observability

Troubleshoot Sync Error

# Check ClusterSecretStore
kubectl describe clustersecretstore aws-secrets

# Check ExternalSecret events
kubectl describe externalsecret alertmanager-pagerduty-secret -n observability

# Check if secret exists in AWS
aws secretsmanager describe-secret --secret-id /sparki/prod/pagerduty/routing-key

Kubernetes Resources

Pod Status

# All observability pods
kubectl get pods -n observability -o wide

# All logging pods
kubectl get pods -n logging -o wide

# Watch pod status
kubectl get pods -n observability -w

Resource Usage

# Top pods by memory
kubectl top pods -n observability --sort-by=memory

# Top pods by CPU
kubectl top pods -n observability --sort-by=cpu

Pod Logs

# Prometheus
kubectl logs -n observability -l app.kubernetes.io/name=prometheus --tail=50

# Alertmanager
kubectl logs -n observability -l app.kubernetes.io/name=alertmanager --tail=50

# Grafana
kubectl logs -n observability -l app.kubernetes.io/name=grafana --tail=50

# Follow logs
kubectl logs -n observability -l app=prometheus -f

Restart Components

# Restart Prometheus
kubectl rollout restart statefulset prometheus-kube-prom-prometheus -n observability

# Restart Alertmanager
kubectl rollout restart statefulset prometheus-kube-prom-alertmanager -n observability

# Restart Grafana
kubectl rollout restart deployment prometheus-kube-prom-grafana -n observability

# Restart Fluentd
kubectl rollout restart daemonset fluentd -n logging

Health Checks

Quick Health Check

#!/bin/bash
# Quick pass/fail snapshot of the observability and logging stacks.
echo "=== Quick Observability Health Check ==="

# Pod phase ("Running", "Pending", ...) of the first pod behind each selector.
PROM=$(kubectl get pod -n observability -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].status.phase}')
echo "Prometheus: $PROM"

ALERT=$(kubectl get pod -n observability -l app.kubernetes.io/name=alertmanager -o jsonpath='{.items[0].status.phase}')
echo "Alertmanager: $ALERT"

GRAF=$(kubectl get pod -n observability -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}')
echo "Grafana: $GRAF"

ES=$(kubectl get pod -n logging -l elasticsearch.k8s.elastic.co/cluster-name=elasticsearch -o jsonpath='{.items[0].status.phase}')
echo "Elasticsearch: $ES"

# DaemonSet readiness reported as "ready/desired"
FLUENT=$(kubectl get daemonset -n logging fluentd -o jsonpath='{.status.numberReady}/{.status.desiredNumberScheduled}')
echo "Fluentd: $FLUENT"

EXT=$(kubectl get externalsecret -n observability alertmanager-pagerduty-secret -o jsonpath='{.status.lastSyncTime}')
echo "ExternalSecrets last sync: $EXT"

# Explicit if/else instead of `checks && ok || fail`: with the && || chain
# the failure branch would also run if the success echo itself failed.
if [ "$PROM" = "Running" ] && [ "$ALERT" = "Running" ] && [ "$GRAF" = "Running" ] && [ "$ES" = "Running" ]; then
  echo "✓ All healthy"
else
  echo "✗ Check failed components"
fi

Deep Health Check

# Run full verification script
bash ./infra/kubernetes-manifests/base/observability/scripts/verify-deployment.sh

Debugging

Check Event Logs

# Get recent events
kubectl get events -n observability --sort-by='.lastTimestamp' | tail -20

# Watch events
kubectl get events -n observability -w

Describe Pod

# Full pod details including events
kubectl describe pod -n observability <pod-name>

View Pod Logs with Timestamps

# Prometheus logs with timestamps
kubectl logs -n observability -l app.kubernetes.io/name=prometheus --timestamps=true | tail -20

Execute Commands in Pod

# Test connectivity from Alertmanager
# Test outbound connectivity from the Alertmanager pod (PagerDuty reachability)
kubectl exec -n observability prometheus-kube-prom-alertmanager-0 -- \
  curl -s https://api.pagerduty.com/ping

# Test connectivity from Fluentd to the in-cluster Elasticsearch service
# (replace <fluentd-pod> with a real pod name; -k: self-signed certificate)
kubectl exec -n logging <fluentd-pod> -- \
  curl -k https://elasticsearch.logging.svc.cluster.local:9200/_cluster/health

Emergency Commands

Force Restart Pod

# Delete pod (it will be recreated)
kubectl delete pod -n observability <pod-name>

Increase Resource Limits

# Edit Prometheus resource limits
# Edit the Prometheus custom resource in your $EDITOR
kubectl edit prometheus -n observability
# Modify: spec.prometheusSpec.resources.limits

Reduce Data Retention

# Quick reduce to save space
# Quick reduce to save space (the operator restarts Prometheus to apply it)
kubectl patch prometheus -n observability \
  --type merge \
  -p '{"spec":{"retention":"7d"}}'

Disable Alert Routing (Emergency)

# Add "halt" receiver (discards all alerts)
# Add "halt" receiver (a receiver with no integrations discards all alerts)
kubectl edit alertmanagerconfig -n observability

# Under routes, add:
# - receiver: 'halt'
#   group_by: ['alertname']

Tips

  • Always port-forward rather than expose services externally for security
  • Use jq for JSON parsing - it makes Prometheus API queries readable
  • Save commands as shell functions in ~/.bashrc for repeated use
  • Use kubectl aliases like k=kubectl to save typing
  • Test in staging first before running any commands in production

See Also