Monitoring Setup
Overview
Comprehensive monitoring setup for Solar-Log Enterprise using Prometheus, Grafana, and Loki.
Architecture
┌─────────────────────────────────────────────────────────┐
│                   Grafana (Port 3001)                   │
│                  Dashboards & Alerting                  │
└───────────────┬─────────────────────────────────────────┘
                │
    ┌───────────┴──────────┬──────────────────┐
    │                      │                  │
┌───▼────────┐   ┌─────────▼────────┐   ┌─────▼──────┐
│ Prometheus │   │       Loki       │   │  AlertMgr  │
│ (Metrics)  │   │      (Logs)      │   │  (Alerts)  │
└───┬────────┘   └─────────┬────────┘   └────────────┘
    │                      │
    │            ┌─────────▼────────┐
    │            │     Promtail     │
    │            │  (Log Shipper)   │
    │            └──────────────────┘
    │
┌───▼─────────────────────────────────────────────────────┐
│                  Application Services                   │
│  ┌──────────┐  ┌──────────┐  ┌──────────┐               │
│  │ Backend  │  │ Frontend │  │  Nginx   │               │
│  └──────────┘  └──────────┘  └──────────┘               │
└─────────────────────────────────────────────────────────┘
Prometheus Setup
docker-compose.monitoring.yml
# docker-compose.monitoring.yml
# Core monitoring stack: Prometheus (metrics), Alertmanager (notification
# routing), Grafana (dashboards) and node-exporter (host metrics).
# All services join the `monitoring` bridge network and reach each other by
# service name (e.g. http://prometheus:9090).
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: solarlog-prometheus
    restart: always
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      # prometheus.yml loads "rules/*.yml", which is resolved relative to the
      # config file's directory, so the rule files must be mounted next to it.
      - ./prometheus/rules:/etc/prometheus/rules:ro
      - prometheus-data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - monitoring

  # Receives alerts from Prometheus (target `alertmanager:9093` in
  # prometheus.yml) and sends the notifications.
  alertmanager:
    image: prom/alertmanager:latest
    container_name: solarlog-alertmanager
    restart: always
    volumes:
      # Mounted at the image's default config path, so no command override
      # is required.
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    ports:
      - "9093:9093"
    networks:
      - monitoring

  grafana:
    image: grafana/grafana:latest
    container_name: solarlog-grafana
    restart: always
    environment:
      # Both values fall back to "admin" when the variables are unset;
      # set GRAFANA_USER / GRAFANA_PASSWORD for anything non-local.
      - GF_SECURITY_ADMIN_USER=${GRAFANA_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
      - ./grafana/datasources:/etc/grafana/provisioning/datasources:ro
    ports:
      # Host port 3001 -> Grafana's container port 3000.
      - "3001:3000"
    networks:
      - monitoring
    depends_on:
      - prometheus

  node-exporter:
    image: prom/node-exporter:latest
    container_name: solarlog-node-exporter
    restart: always
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Point the exporter at the host root mounted below; without this flag
      # the /rootfs mount is unused and filesystem metrics describe the
      # container instead of the host.
      - '--path.rootfs=/rootfs'
      # `$$` is Compose escaping for a literal `$` in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring

volumes:
  prometheus-data: {}
  grafana-data: {}
  # Used by the Loki service described in the logging section; Compose
  # rejects a service that references an undeclared named volume.
  loki-data: {}

networks:
  monitoring:
    driver: bridge
prometheus.yml
# prometheus/prometheus.yml
# Main Prometheus configuration: global timings, the Alertmanager endpoint,
# rule files, and one scrape job per monitored component.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'solarlog-production'
    environment: 'production'

# Alerting configuration: where firing alerts are delivered.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

# Load rules (path is relative to this file's directory).
rule_files:
  - "rules/*.yml"

# Scrape configurations
scrape_configs:
  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - 'localhost:9090'

  # Node Exporter (System Metrics)
  - job_name: 'node'
    static_configs:
      - targets:
          - 'node-exporter:9100'
        labels:
          instance: 'solarlog-server'

  # Backend Application
  - job_name: 'backend'
    metrics_path: '/metrics'
    static_configs:
      - targets:
          - 'backend:8000'
        labels:
          service: 'fastapi'

  # PostgreSQL Exporter
  # NOTE(review): postgres-exporter, nginx-exporter and cadvisor are not
  # defined in the monitoring compose file shown in this guide - confirm
  # they run elsewhere and are reachable from the Prometheus container.
  - job_name: 'postgres'
    static_configs:
      - targets:
          - 'postgres-exporter:9187'

  # Nginx Exporter
  - job_name: 'nginx'
    static_configs:
      - targets:
          - 'nginx-exporter:9113'

  # Docker Containers
  - job_name: 'docker'
    static_configs:
      - targets:
          - 'cadvisor:8080'
Backend Metrics Integration
Add Prometheus to FastAPI
# backend/app/main.py
from fastapi import FastAPI
from prometheus_client import Counter, Gauge, Histogram
from prometheus_fastapi_instrumentator import Instrumentator

app = FastAPI()

# Prometheus metrics: instrument every route with the library's default HTTP
# metrics and serve them at /metrics (the path scraped by the `backend` job).
Instrumentator().instrument(app).expose(app, endpoint="/metrics")

# Custom metrics

# Incremented once per poll attempt; `status` distinguishes outcomes.
inverter_poll_counter = Counter(
    'inverter_polls_total',
    'Total number of inverter polls',
    ['inverter_id', 'status']
)

# Latest power reading per inverter; used by the dashboard panels and the
# InverterOffline alert.
inverter_power_gauge = Gauge(
    'inverter_current_power_watts',
    'Current power production in watts',
    ['inverter_id', 'manufacturer']
)

# The instrumentator's default metrics already include a histogram named
# `http_request_duration_seconds`; registering a second collector with that
# name in the same registry fails with "Duplicated timeseries", so this
# application-level histogram uses its own prefix.
response_time_histogram = Histogram(
    'solarlog_http_request_duration_seconds',
    'HTTP request latency',
    ['method', 'endpoint']
)
Grafana Dashboards
Datasource Configuration
# grafana/datasources/datasources.yml
# Data sources provisioned into Grafana from this file. Both are queried
# through the Grafana backend (`access: proxy`) and are locked against
# edits in the UI (`editable: false`).
apiVersion: 1

datasources:
  # Metrics backend; preselected in new panels.
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    access: proxy
    isDefault: true
    editable: false

  # Log backend.
  - name: Loki
    type: loki
    url: http://loki:3100
    access: proxy
    editable: false
Dashboard Provisioning
# grafana/dashboards/dashboard.yml
# File-based dashboard provider: Grafana loads dashboard JSON files from
# `options.path` and re-scans that directory every 10 seconds.
apiVersion: 1

providers:
  - name: 'SolarLog Dashboards'
    type: file
    orgId: 1
    # Empty folder name: dashboards are not placed in a named folder.
    folder: ''
    updateIntervalSeconds: 10
    allowUiUpdates: true
    disableDeletion: false
    options:
      # Same directory the compose file mounts ./grafana/dashboards into.
      path: /etc/grafana/provisioning/dashboards
Solar-Log Dashboard JSON
{
  "title": "Solar-Log Production Dashboard",
  "panels": [
    {
      "title": "Total Power Production",
      "type": "timeseries",
      "targets": [
        {
          "expr": "sum(inverter_current_power_watts)"
        }
      ]
    },
    {
      "title": "Inverter Status",
      "type": "stat",
      "targets": [
        {
          "expr": "count(inverter_current_power_watts > 0)"
        }
      ]
    },
    {
      "title": "API Request Rate",
      "type": "timeseries",
      "targets": [
        {
          "expr": "rate(http_requests_total[5m])"
        }
      ]
    }
  ]
}
Logging with Loki
Add Loki & Promtail
# docker-compose.monitoring.yml (add to services)
# Both entries go under the existing top-level `services:` key. The named
# volume `loki-data` must also be declared under the top-level `volumes:`
# key; Compose rejects a service that references an undeclared volume.
# NOTE(review): both images track `latest`; confirm that loki-config.yml and
# promtail-config.yml match the versions actually pulled.
  loki:
    image: grafana/loki:latest
    container_name: solarlog-loki
    restart: always
    ports:
      - "3100:3100"
    volumes:
      # Config is mounted read-only, like the other services' config files.
      - ./loki/loki-config.yml:/etc/loki/local-config.yaml:ro
      - loki-data:/loki
    networks:
      - monitoring

  promtail:
    image: grafana/promtail:latest
    container_name: solarlog-promtail
    restart: always
    volumes:
      # Host logs and Docker container logs, both read-only.
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - ./promtail/promtail-config.yml:/etc/promtail/config.yml:ro
    command: -config.file=/etc/promtail/config.yml
    networks:
      - monitoring
    depends_on:
      - loki
Loki Configuration
# loki/loki-config.yml
# Single-instance Loki: in-memory ring, BoltDB index and chunks on the
# local filesystem under /loki, no tenant authentication.
# NOTE(review): several keys here (plain `boltdb` store, `table_manager`,
# `enforce_metric_name`, `max_look_back_period`) appear to follow the
# Loki 2.x configuration layout - confirm they are still accepted by the
# image version that is actually deployed.
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
  chunk_idle_period: 5m
  chunk_retain_period: 30s

schema_config:
  configs:
    # One schema period starting 2024-01-01, with a new index table
    # every 24h.
    - from: 2024-01-01
      store: boltdb
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb:
    directory: /loki/index
  filesystem:
    directory: /loki/chunks

limits_config:
  enforce_metric_name: false
  # Reject log lines older than 168h (7 days).
  reject_old_samples: true
  reject_old_samples_max_age: 168h

chunk_store_config:
  max_look_back_period: 0s

# Retention: delete data older than 720h (30 days).
table_manager:
  retention_deletes_enabled: true
  retention_period: 720h
Alerting
Alert Rules
# prometheus/rules/alerts.yml
# Alerting rules for the Solar-Log stack. Each rule fires only after its
# expression has been continuously true for the `for:` duration.
groups:
  - name: solarlog_alerts
    # Evaluate this group every 30s (overrides the global
    # evaluation_interval).
    interval: 30s
    rules:
      # Backend Down: the `backend` scrape job cannot reach its target.
      - alert: BackendDown
        expr: up{job="backend"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Backend service is down"
          description: "Backend has been down for more than 1 minute"

      # High Error Rate: more than 0.1 HTTP 5xx responses per second
      # (per series) averaged over 5 minutes.
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High HTTP error rate"
          description: "Error rate is {{ $value }} errors/sec"

      # Database Connection Issues: the PostgreSQL exporter reports that it
      # cannot reach the database.
      - alert: DatabaseConnectionFailed
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Cannot connect to PostgreSQL"
          description: "PostgreSQL has been unreachable for more than 1 minute"

      # High Memory Usage
      - alert: HighMemoryUsage
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Less than 10% memory available"

      # Disk Space Low
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space running low"
          description: "Less than 10% disk space available"

      # Inverter Offline: zero output during the daytime window.
      # hour() returns a single sample with no labels, so a plain `and`
      # would never match the labelled inverter series; `on()` matches on
      # an empty label set so every inverter series is kept while the
      # time-of-day condition holds. hour() is evaluated in UTC - adjust
      # the bounds for the site's timezone.
      - alert: InverterOffline
        expr: inverter_current_power_watts == 0 and on() (hour() > 6 and hour() < 20)
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Inverter {{ $labels.inverter_id }} appears offline"
          description: "No power production during daylight hours"
AlertManager Configuration
# alertmanager/alertmanager.yml
# Routes alerts from Prometheus to e-mail receivers. Critical alerts go to
# a wider distribution list; everything else goes to the default receiver.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alerts@solarlog.example.com'
  # Placeholders - replace with real credentials and keep them out of
  # version control.
  smtp_auth_username: 'your-email@gmail.com'
  smtp_auth_password: 'your-app-password'

route:
  # Alerts sharing these labels are batched into one notification.
  group_by: ['alertname', 'cluster']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  # Default receiver for alerts that match no child route.
  receiver: 'email'
  routes:
    # `matchers` replaces the deprecated `match` map.
    - matchers:
        - severity="critical"
      receiver: 'email-critical'

receivers:
  - name: 'email'
    email_configs:
      - to: 'admin@example.com'
        headers:
          Subject: '[SolarLog] Alert: {{ .GroupLabels.alertname }}'

  - name: 'email-critical'
    email_configs:
      # Comma-separated address list.
      - to: 'admin@example.com,team@example.com'
        headers:
          Subject: '[CRITICAL] SolarLog Alert: {{ .GroupLabels.alertname }}'

  # Defined but not referenced by any route above; add a route with
  # `receiver: 'slack'` to activate it.
  - name: 'slack'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/WEBHOOK/URL'
        channel: '#solarlog-alerts'
        title: 'SolarLog Alert'
        text: '{{ .CommonAnnotations.description }}'
Usage
Start Monitoring Stack
# Start all monitoring services in the background (-d = detached)
docker compose -f docker-compose.monitoring.yml up -d
# Check the status of the stack's containers
docker compose -f docker-compose.monitoring.yml ps
# Follow (-f) the Grafana container's log output
docker compose -f docker-compose.monitoring.yml logs -f grafana
Access Dashboards
- Grafana: http://localhost:3001 (default login admin/admin; override with the GRAFANA_USER and GRAFANA_PASSWORD environment variables)
- Prometheus: http://localhost:9090
- AlertManager: http://localhost:9093
Import Dashboards
- Login to Grafana
- Go to Dashboards → Import
- Use Dashboard IDs:
  - Node Exporter: 1860
  - Docker: 893
  - PostgreSQL: 9628
  - Nginx: 12708
Troubleshooting
# Check Prometheus targets (lists every scrape target via the HTTP API)
curl http://localhost:9090/api/v1/targets
# Query metrics: run the PromQL expression `up` through the HTTP API
curl 'http://localhost:9090/api/v1/query?query=up'
# Check Loki logs (container output of the loki service)
docker compose -f docker-compose.monitoring.yml logs loki
# Restart services (here: Prometheus and Grafana, e.g. after a config change)
docker compose -f docker-compose.monitoring.yml restart prometheus grafana