Files
smart-city-digital-twin-mar…/scripts/smartcity_monitor.py
Eric FELIXINE 83779cf5d7 fix: telegraf topics, mqtt brokers, docker-compose fixes
- Fix MOSQUITTO_HOST (wrong container name)
- Fix EMQX_PORT (1885 external -> 1883 internal)
- Fix telegraf MQTT topics (city/sensors/#)
- Fix BunkerM dynsec JSON
- Add kepler.yml Traefik config
- Update monitoring script
2026-06-07 20:18:41 -04:00

163 lines
5.5 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Smart City Digital Twin Martinique - Monitoring Script
Hybrid mode: Periodic checks + webhook-ready output
Alerts via Telegram when issues detected
Current stack (as of 2026-06-05):
- Analytics: Trino, StarRocks FE/BE, ClickHouse, Delta Lake, DuckDB, Streamlit
- FlexMeasures: Server, Worker, DB, Redis
- Airflow: Scheduler, Webserver, Postgres
- SmartApp: Web, API
- Gitea: Server, Runner
- Traefik: Reverse proxy
- Kepler: Geospatial visualization
"""
import subprocess
import json
import sys
from datetime import datetime
# Configuration - CURRENT RUNNING STACK
# Names of the Docker containers whose absence is reported by check_containers().
CRITICAL_CONTAINERS = [
    # Analytics stack
    "trino",
    "starrocks-fe",
    "starrocks-be",
    "clickhouse",
    "delta-lake",
    "duckdb",
    "streamlit",
    "trino-nginx",
    # FlexMeasures stack
    "flexmeasures-server",
    "flexmeasures-worker",
    "flexmeasures-db",
    "flexmeasures-redis",
    # Airflow stack
    "airflow-scheduler",
    "airflow-webserver",
    "airflow-postgres",
    # SmartApp
    "smartapp-web",
    "smartapp-api",
    # Gitea
    "gitea",
    "gitea-runner",
    # Infrastructure
    "traefik",
    "smart-city-kepler",
]
# Public endpoints to probe, as (display name, URL) pairs.
ENDPOINTS = [
    # SmartApp
    (
        "SmartApp Web",
        "https://smartapp.digitribe.fr",
    ),
    (
        "SmartApp API",
        "https://api-smartapp.digitribe.fr/health",
    ),
    # Analytics
    (
        "Trino",
        "https://trino.digitribe.fr",
    ),
    (
        "Streamlit",
        "https://streamlit.digitribe.fr",
    ),
    (
        "ClickHouse",
        "https://clickhouse.digitribe.fr",
    ),
    (
        "StarRocks",
        "https://starrocks.digitribe.fr",
    ),
    (
        "DuckDB",
        "https://duckdb.digitribe.fr",
    ),
    (
        "Delta Lake",
        "https://deltalake.digitribe.fr",
    ),
    # FlexMeasures
    (
        "FlexMeasures",
        "https://flexmeasures.digitribe.fr",
    ),
    # Airflow
    (
        "Airflow",
        "https://airflow.digitribe.fr",
    ),
    # Gitea
    (
        "Gitea",
        "https://gitea.digitribe.fr",
    ),
    # Kepler
    (
        "Kepler",
        "https://kepler.digitribe.fr",
    ),
]
# Endpoints known to have issues (documented).
# Maps an endpoint URL to a human-readable note that is included in the alert text.
KNOWN_ISSUES = {
    "https://trino.digitribe.fr": (
        "200/302 - Trino UI accessible at /ui/ (redirects to login)"
    ),
    "https://kepler.digitribe.fr": (
        "404 - no Traefik route configured for Kepler"
    ),
    "https://starrocks.digitribe.fr": (
        "502 - StarRocks FE HTTP port 8030 not ready (FE still starting up)"
    ),
}

# Telegram recipient handle; will be used by Hermes send_message.
TELEGRAM_USER = "@ericf972"
def run_cmd(cmd):
    """Execute *cmd* through the shell and report its outcome.

    Returns a ``(stdout, stderr, returncode)`` tuple in which both streams
    are decoded as text and stripped of surrounding whitespace. The command
    is given 10 seconds to finish. Any exception raised while running it
    (the timeout included) is not propagated; it is reported instead as
    ``("", <exception text>, 1)``.
    """
    try:
        completed = subprocess.run(
            cmd,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            timeout=10,
        )
        captured_out = completed.stdout.strip()
        captured_err = completed.stderr.strip()
        return captured_out, captured_err, completed.returncode
    except Exception as exc:
        # Best-effort helper: callers only ever inspect the returned tuple.
        return "", str(exc), 1
def check_containers():
    """Check if critical containers are running.

    Lists the names of the running containers with a single ``docker ps``
    call and compares every entry of ``CRITICAL_CONTAINERS`` against that
    listing by exact name.

    Returns:
        A list of alert strings, one per critical container whose name is
        not in the listing. When ``docker ps`` yields no output (for example
        because the command failed), every critical container is reported.
    """
    out, err, code = run_cmd("docker ps --format '{{.Names}}'")
    running = {line.strip() for line in out.splitlines() if line.strip()}

    # Exact membership is required here: a word-based grep treats "-" as a
    # word boundary, so "trino" would be satisfied by a running "trino-nginx"
    # and "gitea" by "gitea-runner", hiding a container that is really down.
    return [
        f"🛑 Container DOWN: {container}"
        for container in CRITICAL_CONTAINERS
        if container not in running
    ]
def check_endpoints():
    """Check if key endpoints are accessible.

    Requests every URL in ``ENDPOINTS`` with ``curl`` and inspects the HTTP
    status code. An endpoint is healthy when curl exits with 0 and the
    status is 200, 301, 302 or 303.

    Returns:
        A list of alert strings with exactly one entry per unhealthy
        endpoint. If the URL is listed in ``KNOWN_ISSUES`` the entry is a
        "Known issue" line carrying the documented note; otherwise it is an
        "Endpoint DOWN" line. Healthy endpoints produce no entry.
    """
    import shlex  # local import: only this function needs shell quoting

    healthy_codes = ("200", "301", "302", "303")
    issues = []
    for name, url in ENDPOINTS:
        # Quote the URL so characters such as "&" or "?" cannot be
        # interpreted by the shell that run_cmd spawns.
        cmd = (
            "curl -k -s -o /dev/null -w '%{http_code}' "
            f"--connect-timeout 5 {shlex.quote(url)}"
        )
        out, err, code = run_cmd(cmd)

        if code == 0 and out in healthy_codes:
            # Healthy endpoints never raise an alert, even when they have a
            # documented known issue; otherwise the run could never be clean.
            continue

        if url in KNOWN_ISSUES:
            # Report a documented failure once, with its explanation,
            # instead of also listing it as a generic outage.
            issues.append(
                f"⚠️ Known issue: {name} ({url}) - HTTP {out} - {KNOWN_ISSUES[url]}"
            )
        else:
            issues.append(f"🌐 Endpoint DOWN: {name} ({url}) - HTTP {out}")
    return issues
def check_network():
    """Probe container-to-container connectivity from the Traefik container.

    For each backend, runs ``wget --spider`` inside the ``traefik`` container
    against the backend's internal ``host:port`` address. A non-zero exit
    status is reported as a network issue.

    Returns:
        A list of alert strings, one per backend whose probe failed.
    """
    # (service name, internal host:port address used in the probe URL)
    probes = (
        ("trino", "trino:8080"),
        ("streamlit", "streamlit:8501"),
        ("clickhouse", "clickhouse:8123"),
        ("starrocks-fe", "starrocks-fe:8030"),
        ("flexmeasures-server", "flexmeasures-server:5000"),
        ("airflow-webserver", "airflow-webserver:8080"),
        ("smartapp-web", "smartapp-web:80"),
        ("gitea", "gitea:3000"),
    )

    unreachable = []
    for label, address in probes:
        _, _, status = run_cmd(
            f"docker exec traefik wget -q --spider http://{address} 2>&1"
        )
        if status == 0:
            continue
        unreachable.append(f"🔌 Network issue: Traefik → {label} ({address})")
    return unreachable
def _read_percentage(cmd):
    """Run *cmd* and return its output as an int, or None if it is not one."""
    out, err, code = run_cmd(cmd)
    try:
        return int(out)
    except ValueError:
        # Empty or unexpected output (failed command, error text, extra
        # lines) must not abort the whole monitoring run.
        return None


def check_resources():
    """Check system resources.

    Reads the used percentage of the root filesystem (from ``df``) and of
    memory (from ``free``) and compares them with fixed thresholds.

    Returns:
        A list of alert strings: one if disk usage is above 80%, one if
        memory usage is above 90%. A metric whose command output cannot be
        parsed as an integer is skipped.
    """
    issues = []

    # Disk space
    disk_used = _read_percentage("df -h / | awk 'NR==2 {print $5}' | tr -d '%'")
    if disk_used is not None and disk_used > 80:
        issues.append(f"💾 Disk space critical: {disk_used}% used")

    # Memory
    memory_used = _read_percentage("free | awk '/Mem:/ {print int($3/$2 * 100)}'")
    if memory_used is not None and memory_used > 90:
        issues.append(f"🧠 Memory critical: {memory_used}% used")

    return issues
def main():
    """Run every check, print a report, and exit with a status code.

    Prints a timestamped header, then runs the container, endpoint, network
    and resource checks in that order. Exits with status 0 when no check
    returned an issue; otherwise prints each issue and exits with status 1.
    """
    started_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"🔍 Smart City Monitoring Check - {started_at}")
    print("=" * 50)

    # Run all checks
    findings = []
    findings += check_containers()
    findings += check_endpoints()
    findings += check_network()
    findings += check_resources()

    # Output results
    if not findings:
        print("✅ All systems operational")
        sys.exit(0)

    print(f"⚠️ ALERT: {len(findings)} issue(s) detected!")
    for finding in findings:
        print(f" - {finding}")
    # This output will be captured by Hermes cron job and sent to Telegram
    sys.exit(1)  # Non-zero exit code indicates issues


if __name__ == "__main__":
    main()