---
# Prometheus alerting rules, organised into one group per monitored layer:
# infrastructure (hosts / scrape targets), application (HTTP metrics) and
# database (per-datastore availability).

groups:
  # Host- and scrape-level health.
  - name: infrastructure
    rules:
      # Fires for any scrape target whose `up` series has been 0 for 5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Used memory (total minus available) as a fraction of total, above 0.9.
      - alert: HighMemoryUsage
        expr: |-
          (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
            / node_memory_MemTotal_bytes
            > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 90% on {{ $labels.instance }}"

      # 100 minus the average idle-CPU percentage per instance, above 80.
      # `rate` is used rather than `irate`: irate looks only at the last two
      # samples in the window, so a single quiet scrape can drop the value
      # below the threshold and reset the 5m `for` timer.
      - alert: HighCPUUsage
        expr: |-
          100 - (
            avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100
          ) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% on {{ $labels.instance }}"

  # HTTP-level service health.
  - name: application
    rules:
      # Share of 5xx responses in all responses, per job, above 0.1.
      # Both sides are summed by `job` before dividing. Without aggregation
      # the division matches series on their full label set (including
      # `status`), so each 5xx series would be divided by itself and the
      # ratio would be 1 whenever any 5xx traffic exists.
      - alert: HighErrorRate
        expr: |-
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
            /
          sum by (job) (rate(http_requests_total[5m]))
            > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is above 10% on {{ $labels.job }}"

      # 0.95 quantile of request duration over 5m, above 1 (second).
      # NOTE(review): the quantile is computed per individual series (every
      # label combination of the histogram), not per job; if one alert per
      # job is wanted, wrap the rate in `sum by (job, le) (...)` - confirm.
      - alert: SlowResponseTime
        expr: |-
          histogram_quantile(
            0.95,
            rate(http_request_duration_seconds_bucket[5m])
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Slow response time on {{ $labels.job }}"
          description: "95th percentile response time is above 1 second on {{ $labels.job }}"

  # Datastore availability: each rule fires when the `up` series of the
  # named scrape job has been 0 for 1 minute.
  - name: database
    rules:
      - alert: PostgreSQLDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL database is not responding"

      - alert: Neo4jDown
        expr: up{job="neo4j"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Neo4j is down"
          description: "Neo4j graph database is not responding"

      - alert: QdrantDown
        expr: up{job="qdrant"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Qdrant is down"
          description: "Qdrant vector database is not responding"

      - alert: RedisDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis is down"
          description: "Redis cache is not responding"