Monitoring Setup

Overview

Comprehensive monitoring setup for Solar-Log Enterprise using Prometheus (metrics), Grafana (dashboards), Loki with Promtail (logs), and Alertmanager (alert routing).

Architecture

┌─────────────────────────────────────────────────────────┐
│                     Grafana (Port 3001)                 │
│              Dashboards & Alerting                      │
└───────────────┬─────────────────────────────────────────┘
                │
    ┌───────────┴──────────┬──────────────────┐
    │                      │                  │
┌───▼────────┐  ┌─────────▼────────┐  ┌─────▼──────┐
│ Prometheus │  │      Loki        │  │  AlertMgr  │
│  (Metrics) │  │     (Logs)       │  │  (Alerts)  │
└───┬────────┘  └─────────┬────────┘  └────────────┘
    │                     │
    │           ┌─────────▼────────┐
    │           │    Promtail      │
    │           │  (Log Shipper)   │
    │           └──────────────────┘
    │
┌───▼─────────────────────────────────────────────────────┐
│              Application Services                        │
│  ┌──────────┐  ┌──────────┐  ┌──────────┐             │
│  │ Backend  │  │ Frontend │  │  Nginx   │             │
│  └──────────┘  └──────────┘  └──────────┘             │
└─────────────────────────────────────────────────────────┘

Prometheus Setup

docker-compose.monitoring.yml

# Monitoring stack: Prometheus, Alertmanager, Grafana and node-exporter on a
# shared "monitoring" bridge network.
# NOTE: Compose v2 treats `version` as informational only.
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: solarlog-prometheus
    restart: always
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      # prometheus.yml loads "rules/*.yml" relative to its own directory, so
      # the rules folder has to be mounted next to it or no rule is loaded.
      - ./prometheus/rules:/etc/prometheus/rules:ro
      - prometheus-data:/prometheus
    ports:
      - "9090:9090"
    networks:
      - monitoring

  # Target of the `alerting` section in prometheus.yml (alertmanager:9093).
  alertmanager:
    image: prom/alertmanager:latest
    container_name: solarlog-alertmanager
    restart: always
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager-data:/alertmanager
    ports:
      - "9093:9093"
    networks:
      - monitoring

  grafana:
    image: grafana/grafana:latest
    container_name: solarlog-grafana
    restart: always
    environment:
      # Falls back to admin/admin when the variables are unset; set
      # GRAFANA_USER / GRAFANA_PASSWORD before exposing port 3001.
      - GF_SECURITY_ADMIN_USER=${GRAFANA_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
    volumes:
      - grafana-data:/var/lib/grafana
      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
      - ./grafana/datasources:/etc/grafana/provisioning/datasources:ro
    ports:
      # Host port 3001 -> Grafana's container port 3000
      - "3001:3000"
    networks:
      - monitoring
    depends_on:
      - prometheus

  node-exporter:
    image: prom/node-exporter:latest
    container_name: solarlog-node-exporter
    restart: always
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Point the filesystem collector at the host root mounted below;
      # without it the collector reports the container's own filesystem.
      - '--path.rootfs=/rootfs'
      # `$$` is Compose escaping for a literal `$` in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring

volumes:
  prometheus-data:
  grafana-data:
  alertmanager-data:
  # Used by the Loki service from the "Add Loki & Promtail" section.
  loki-data:

networks:
  monitoring:
    driver: bridge

prometheus.yml

# prometheus/prometheus.yml
global:
  # Scrape targets and evaluate rules every 15 seconds.
  scrape_interval: 15s
  evaluation_interval: 15s
  # Labels added to series and alerts sent to external systems.
  external_labels:
    cluster: "solarlog-production"
    environment: "production"

# Alertmanager instances that receive firing alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "alertmanager:9093"

# Rule files (path is resolved relative to this config file)
rule_files:
  - "rules/*.yml"

# Scrape jobs, one per monitored component
scrape_configs:
  # Prometheus scraping its own metrics
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # Host-level metrics from node-exporter
  - job_name: "node"
    static_configs:
      - targets:
          - "node-exporter:9100"
        labels:
          instance: "solarlog-server"

  # FastAPI backend, scraped at /metrics
  - job_name: "backend"
    metrics_path: "/metrics"
    static_configs:
      - targets:
          - "backend:8000"
        labels:
          service: "fastapi"

  # PostgreSQL exporter
  - job_name: "postgres"
    static_configs:
      - targets:
          - "postgres-exporter:9187"

  # Nginx exporter
  - job_name: "nginx"
    static_configs:
      - targets:
          - "nginx-exporter:9113"

  # Container metrics from cAdvisor
  - job_name: "docker"
    static_configs:
      - targets:
          - "cadvisor:8080"

Backend Metrics Integration

Add Prometheus to FastAPI

# backend/requirements.txt
# Pinned Prometheus dependencies imported by backend/app/main.py:
# the FastAPI instrumentator and the client library for custom metrics.
prometheus-fastapi-instrumentator==6.1.0
prometheus-client==0.19.0

# backend/app/main.py
from fastapi import FastAPI
from prometheus_client import Counter, Gauge, Histogram
from prometheus_fastapi_instrumentator import Instrumentator

app = FastAPI()

# Instrument the app and serve the collected metrics at /metrics, the path
# scraped by the Prometheus 'backend' job.
Instrumentator().instrument(app).expose(app, endpoint="/metrics")

# Custom application metrics

# Number of inverter polls, split by inverter and poll outcome.
inverter_poll_counter = Counter(
    'inverter_polls_total',
    'Total number of inverter polls',
    ['inverter_id', 'status']
)

# Latest power reading per inverter; used by the dashboard panels and the
# InverterOffline alert.
inverter_power_gauge = Gauge(
    'inverter_current_power_watts',
    'Current power production in watts',
    ['inverter_id', 'manufacturer']
)

# The instrumentator already exports its own `http_request_duration_seconds`
# histogram, and two metrics with the same name cannot share a registry, so
# this custom histogram uses a distinct, application-prefixed name.
response_time_histogram = Histogram(
    'solarlog_http_request_duration_seconds',
    'HTTP request latency',
    ['method', 'endpoint']
)

Grafana Dashboards

Datasource Configuration

# grafana/datasources/datasources.yml
# Provisioned Grafana datasources; both are reached through the Grafana
# backend (proxy access) and are locked against edits in the UI.
apiVersion: 1

datasources:
  # Metrics (default datasource)
  - name: Prometheus
    type: prometheus
    url: http://prometheus:9090
    access: proxy
    isDefault: true
    editable: false

  # Logs
  - name: Loki
    type: loki
    url: http://loki:3100
    access: proxy
    editable: false

Dashboard Provisioning

# grafana/dashboards/dashboard.yml
# File-based dashboard provider: Grafana rescans the directory below every
# 10 seconds and loads the dashboards it finds there.
apiVersion: 1

providers:
  - name: "SolarLog Dashboards"
    type: file
    orgId: 1
    # Empty folder name
    folder: ""
    updateIntervalSeconds: 10
    allowUiUpdates: true
    disableDeletion: false
    options:
      # Container path of the ./grafana/dashboards mount
      path: /etc/grafana/provisioning/dashboards

Solar-Log Dashboard JSON

{
  "dashboard": {
    "title": "Solar-Log Production Dashboard",
    "panels": [
      {
        "title": "Total Power Production",
        "type": "timeseries",
        "targets": [
          {
            "expr": "sum(inverter_current_power_watts)"
          }
        ]
      },
      {
        "title": "Inverter Status",
        "type": "stat",
        "targets": [
          {
            "expr": "count(inverter_current_power_watts > 0)"
          }
        ]
      },
      {
        "title": "API Request Rate",
        "type": "timeseries",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])"
          }
        ]
      }
    ]
  }
}

Logging with Loki

Add Loki & Promtail

# docker-compose.monitoring.yml (add to services)
# Also declare `loki-data:` under the top-level `volumes:` key; Compose
# rejects a service that references an undeclared named volume.
  loki:
    image: grafana/loki:latest
    container_name: solarlog-loki
    restart: always
    ports:
      - "3100:3100"
    volumes:
      # Mounted over the image's default config path; read-only because the
      # container never needs to modify it.
      - ./loki/loki-config.yml:/etc/loki/local-config.yaml:ro
      - loki-data:/loki
    networks:
      - monitoring

  promtail:
    image: grafana/promtail:latest
    container_name: solarlog-promtail
    restart: always
    volumes:
      # Host logs and Docker container logs, both read-only
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      # promtail-config.yml is not shown in this guide; its `clients` entry
      # must push to http://loki:3100/loki/api/v1/push.
      - ./promtail/promtail-config.yml:/etc/promtail/config.yml:ro
    command: -config.file=/etc/promtail/config.yml
    networks:
      - monitoring
    depends_on:
      - loki

Loki Configuration

# loki/loki-config.yml
# Single-node Loki storing index and chunks on the local /loki volume.
# Written for Loki 3.x (what `grafana/loki:latest` resolves to): the removed
# `enforce_metric_name` limit is gone, and the legacy `boltdb` index with
# `table_manager` retention is replaced by `tsdb` + compactor retention.
auth_enabled: false

server:
  http_listen_port: 3100

# Shared defaults: single replica, in-memory ring, filesystem storage.
common:
  path_prefix: /loki
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules

ingester:
  # Flush a chunk after 5 minutes without new entries ...
  chunk_idle_period: 5m
  # ... and keep it in memory for another 30 seconds after flushing.
  chunk_retain_period: 30s

schema_config:
  configs:
    - from: 2024-01-01
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

limits_config:
  reject_old_samples: true
  reject_old_samples_max_age: 168h
  # 30 days; enforced by the compactor below.
  retention_period: 720h

# Retention is applied by the compactor (replaces table_manager).
compactor:
  working_directory: /loki/compactor
  retention_enabled: true
  delete_request_store: filesystem

Alerting

Alert Rules

# prometheus/rules/alerts.yml
# Alert rules for the Solar-Log stack; the group is evaluated every 30s.
groups:
  - name: solarlog_alerts
    interval: 30s
    rules:
      # Backend Down: the 'backend' scrape job has been failing for 1 minute
      - alert: BackendDown
        expr: up{job="backend"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Backend service is down"
          description: "Backend has been down for more than 1 minute"

      # High Error Rate: more than 0.1 5xx responses per second for 5 minutes
      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High HTTP error rate"
          description: "Error rate is {{ $value }} errors/sec"

      # Database Connection Issues
      - alert: DatabaseConnectionFailed
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Cannot connect to PostgreSQL"
          description: "pg_up has been 0 for more than 1 minute"

      # High Memory Usage: less than 10% of memory available
      - alert: HighMemoryUsage
        expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Less than 10% memory available"

      # Disk Space Low: less than 10% of a filesystem available
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disk space running low"
          description: "Less than 10% disk space available"

      # Inverter Offline: zero output during the daylight window.
      # hour() returns a label-less sample, so it is joined with `and on()`;
      # a plain `and` matches on all labels and would never return a series.
      # hour() is evaluated in UTC - shift the 6/20 bounds for the site's
      # local timezone.
      - alert: InverterOffline
        expr: inverter_current_power_watts == 0 and on() (hour() > 6 and hour() < 20)
        for: 30m
        labels:
          severity: warning
        annotations:
          summary: "Inverter {{ $labels.inverter_id }} appears offline"
          description: "No power production during daylight hours"

AlertManager Configuration

# alertmanager/alertmanager.yml
# Routes every alert to e-mail; alerts labelled severity=critical go to a
# wider recipient list.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alerts@solarlog.example.com'
  # Placeholders - replace with real credentials and keep the real values
  # out of version control.
  smtp_auth_username: 'your-email@gmail.com'
  smtp_auth_password: 'your-app-password'

route:
  group_by: ['alertname', 'cluster']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 12h
  # Default receiver for anything not matched by a child route
  receiver: 'email'
  routes:
    # `matchers` replaces the deprecated `match` key.
    - matchers:
        - severity="critical"
      receiver: 'email-critical'

receivers:
  - name: 'email'
    email_configs:
      - to: 'admin@example.com'
        headers:
          Subject: '[SolarLog] Alert: {{ .GroupLabels.alertname }}'

  - name: 'email-critical'
    email_configs:
      - to: 'admin@example.com,team@example.com'
        headers:
          Subject: '[CRITICAL] SolarLog Alert: {{ .GroupLabels.alertname }}'

  # Defined but not referenced by any route above; add a route with
  # `receiver: 'slack'` to start using it.
  - name: 'slack'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/WEBHOOK/URL'
        channel: '#solarlog-alerts'
        title: 'SolarLog Alert'
        text: '{{ .CommonAnnotations.description }}'

Usage

Start Monitoring Stack

# Bring up every monitoring service in the background
docker compose --file docker-compose.monitoring.yml up --detach

# List the stack's containers and their state
docker compose --file docker-compose.monitoring.yml ps

# Follow the Grafana container's log output
docker compose --file docker-compose.monitoring.yml logs --follow grafana

Access Dashboards

Grafana: http://localhost:3001 (default login admin/admin unless GRAFANA_USER / GRAFANA_PASSWORD are set; change it before exposing the port)
Prometheus: http://localhost:9090
AlertManager: http://localhost:9093

Import Dashboards

1. Log in to Grafana.
2. Go to Dashboards → Import.
3. Enter one of the following grafana.com dashboard IDs:
   - Node Exporter: 1860
   - Docker: 893
   - PostgreSQL: 9628
   - Nginx: 12708

Troubleshooting

# List scrape targets and their health as seen by Prometheus
curl 'http://localhost:9090/api/v1/targets'

# Run an instant query (here: the `up` metric)
curl 'http://localhost:9090/api/v1/query?query=up'

# Show the Loki container's logs
docker compose --file docker-compose.monitoring.yml logs loki

# Restart Prometheus and Grafana
docker compose --file docker-compose.monitoring.yml restart prometheus grafana

Monitoring Setup

Overview

Architecture

Prometheus Setup

docker-compose.monitoring.yml

prometheus.yml

Backend Metrics Integration

Add Prometheus to FastAPI

Grafana Dashboards

Datasource Configuration

Dashboard Provisioning

Solar-Log Dashboard JSON

Logging with Loki

Add Loki & Promtail

Loki Configuration

Alerting

Alert Rules

AlertManager Configuration

Usage

Start Monitoring Stack

Access Dashboards

Import Dashboards

Troubleshooting

Next Steps