- Fix MOSQUITTO_HOST (wrong container name) - Fix EMQX_PORT (1885 external -> 1883 internal) - Fix telegraf MQTT topics (city/sensors/#) - Fix BunkerM dynsec JSON - Add kepler.yml Traefik config - Update monitoring script
163 lines
5.5 KiB
Python
Executable File
163 lines
5.5 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
Smart City Digital Twin Martinique - Monitoring Script
Hybrid mode: Periodic checks + webhook-ready output
Alerts via Telegram when issues are detected

Current stack (as of 2026-06-05):
- Analytics: Trino, StarRocks FE/BE, ClickHouse, Delta Lake, DuckDB, Streamlit
- FlexMeasures: Server, Worker, DB, Redis
- Airflow: Scheduler, Webserver, Postgres
- SmartApp: Web, API
- Gitea: Server, Runner
- Traefik: Reverse proxy
- Kepler: Geospatial visualization
"""
|
|
|
|
import subprocess
|
|
import json
|
|
import sys
|
|
from datetime import datetime
|
|
|
|
# Configuration - CURRENT RUNNING STACK
|
|
CRITICAL_CONTAINERS = [
    # --- Analytics stack ---
    "trino",
    "starrocks-fe",
    "starrocks-be",
    "clickhouse",
    "delta-lake",
    "duckdb",
    "streamlit",
    "trino-nginx",
    # --- FlexMeasures stack ---
    "flexmeasures-server",
    "flexmeasures-worker",
    "flexmeasures-db",
    "flexmeasures-redis",
    # --- Airflow stack ---
    "airflow-scheduler",
    "airflow-webserver",
    "airflow-postgres",
    # --- SmartApp ---
    "smartapp-web",
    "smartapp-api",
    # --- Gitea ---
    "gitea",
    "gitea-runner",
    # --- Infrastructure ---
    "traefik",
    "smart-city-kepler",
]
|
|
|
|
ENDPOINTS = [
    # Each entry is (display name, URL probed by check_endpoints).
    # --- SmartApp ---
    ("SmartApp Web", "https://smartapp.digitribe.fr"),
    ("SmartApp API", "https://api-smartapp.digitribe.fr/health"),
    # --- Analytics ---
    ("Trino", "https://trino.digitribe.fr"),
    ("Streamlit", "https://streamlit.digitribe.fr"),
    ("ClickHouse", "https://clickhouse.digitribe.fr"),
    ("StarRocks", "https://starrocks.digitribe.fr"),
    ("DuckDB", "https://duckdb.digitribe.fr"),
    ("Delta Lake", "https://deltalake.digitribe.fr"),
    # --- FlexMeasures ---
    ("FlexMeasures", "https://flexmeasures.digitribe.fr"),
    # --- Airflow ---
    ("Airflow", "https://airflow.digitribe.fr"),
    # --- Gitea ---
    ("Gitea", "https://gitea.digitribe.fr"),
    # --- Kepler ---
    ("Kepler", "https://kepler.digitribe.fr"),
]
|
|
|
|
# Endpoints known to have issues (documented)
|
|
KNOWN_ISSUES = {
    # Maps an endpoint URL to the note that check_endpoints attaches to its
    # report line for that URL.
    "https://trino.digitribe.fr": (
        "200/302 - Trino UI accessible at /ui/ (redirects to login)"
    ),
    # NOTE(review): this entry may be stale if a Traefik route for Kepler has
    # since been added - confirm against the live configuration.
    "https://kepler.digitribe.fr": (
        "404 - no Traefik route configured for Kepler"
    ),
    "https://starrocks.digitribe.fr": (
        "502 - StarRocks FE HTTP port 8030 not ready (FE still starting up)"
    ),
}
|
|
|
|
# Telegram handle of the alert recipient. Nothing in the visible part of this
# script reads it; per the original note it will be used by Hermes send_message.
TELEGRAM_USER = "@ericf972"
|
|
|
|
def run_cmd(cmd):
    """Execute *cmd* through the shell and report its outcome.

    Args:
        cmd: Command line handed to the shell as a single string.

    Returns:
        A ``(stdout, stderr, returncode)`` tuple with both streams stripped
        of surrounding whitespace. If the command cannot be run or exceeds
        the 10-second timeout, ``("", <error text>, 1)`` is returned instead
        of raising.
    """
    try:
        completed = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception as exc:
        # Best-effort by design: any failure is folded into the return value
        # so a single broken probe cannot abort the whole monitoring run.
        return "", str(exc), 1
    else:
        captured_out = completed.stdout.strip()
        captured_err = completed.stderr.strip()
        return captured_out, captured_err, completed.returncode
|
|
|
|
def check_containers():
    """Report every critical container that is not currently running.

    The names of running containers are read with a single ``docker ps``
    call and each entry of ``CRITICAL_CONTAINERS`` is compared against them
    by exact name.

    Exact matching matters: the previous ``grep -w`` test treated ``-`` as a
    word boundary, so ``trino`` was considered up whenever ``trino-nginx``
    was running (likewise ``gitea`` / ``gitea-runner``).

    Returns:
        A list of alert strings, one per missing container, in the order
        the containers appear in ``CRITICAL_CONTAINERS``. If ``docker ps``
        produces no output (for example because the command failed), every
        critical container is reported as down.
    """
    out, _err, _code = run_cmd("docker ps --format '{{.Names}}'")
    # One name per output line; a set gives exact, O(1) membership tests.
    running = {line.strip() for line in out.splitlines() if line.strip()}
    return [
        f"🛑 Container DOWN: {container}"
        for container in CRITICAL_CONTAINERS
        if container not in running
    ]
|
|
|
|
def check_endpoints():
    """Probe every URL in ``ENDPOINTS`` and report the unhealthy ones.

    Each URL is requested with ``curl`` (certificate verification disabled,
    5-second connect timeout). An endpoint is healthy when curl exits with
    status 0 and the HTTP status is 200, 301, 302 or 303.

    Exactly one line is produced per unhealthy endpoint:

    * if the URL is listed in ``KNOWN_ISSUES``, a "Known issue" line that
      carries the documented note;
    * otherwise an "Endpoint DOWN" line.

    Healthy endpoints produce nothing, even when they are listed in
    ``KNOWN_ISSUES``. Previously such URLs were always reported (and
    reported twice when actually failing), which made an all-clear run
    impossible.

    Returns:
        A list of alert strings in ``ENDPOINTS`` order.
    """
    healthy_codes = {"200", "301", "302", "303"}
    issues = []
    for name, url in ENDPOINTS:
        # -w '%{http_code}' makes curl print only the numeric HTTP status.
        cmd = f"curl -k -s -o /dev/null -w '%{{http_code}}' --connect-timeout 5 {url}"
        out, _err, code = run_cmd(cmd)

        if code == 0 and out in healthy_codes:
            continue

        if url in KNOWN_ISSUES:
            issues.append(
                f"⚠️ Known issue: {name} ({url}) - HTTP {out} - {KNOWN_ISSUES[url]}"
            )
        else:
            issues.append(f"🌐 Endpoint DOWN: {name} ({url}) - HTTP {out}")
    return issues
|
|
|
|
def check_network():
    """Verify that the ``traefik`` container can reach each backend service.

    For every ``(label, host:port)`` pair, ``wget --spider`` is run inside
    the ``traefik`` container against ``http://host:port``. A non-zero exit
    status from that command is reported as a network issue.

    Returns:
        A list of alert strings, one per failing probe, in probe order.
    """
    probes = (
        ("trino", "trino:8080"),
        ("streamlit", "streamlit:8501"),
        ("clickhouse", "clickhouse:8123"),
        ("starrocks-fe", "starrocks-fe:8030"),
        ("flexmeasures-server", "flexmeasures-server:5000"),
        ("airflow-webserver", "airflow-webserver:8080"),
        ("smartapp-web", "smartapp-web:80"),
        ("gitea", "gitea:3000"),
    )
    # Index 2 of run_cmd's result is the exit status of the probe command.
    return [
        f"🔌 Network issue: Traefik → {label} ({hostport})"
        for label, hostport in probes
        if run_cmd(f"docker exec traefik wget -q --spider http://{hostport} 2>&1")[2] != 0
    ]
|
|
|
|
def check_resources(disk_threshold=80, memory_threshold=90):
    """Report disk and memory usage that exceeds the given thresholds.

    Disk usage is the use-percentage of ``/`` as printed by ``df``; memory
    usage is ``used / total`` from the ``Mem:`` row of ``free``, truncated
    to an integer percentage.

    Args:
        disk_threshold: Disk usage (percent) above which an alert is raised.
        memory_threshold: Memory usage (percent) above which an alert is
            raised.

    Returns:
        A list of alert strings (disk first, then memory). A metric whose
        command yields empty or non-numeric output is skipped rather than
        crashing the whole monitoring run.
    """
    probes = (
        (
            "df -h / | awk 'NR==2 {print $5}' | tr -d '%'",
            disk_threshold,
            "💾 Disk space critical",
        ),
        (
            "free | awk '/Mem:/ {print int($3/$2 * 100)}'",
            memory_threshold,
            "🧠 Memory critical",
        ),
    )

    issues = []
    for cmd, limit, label in probes:
        out, _err, _code = run_cmd(cmd)
        try:
            used_percent = int(out)
        except ValueError:
            # Empty output (command failed) or unexpected text: the metric
            # is unavailable, so there is nothing to compare.
            continue
        if used_percent > limit:
            issues.append(f"{label}: {out}% used")
    return issues
|
|
|
|
def main():
    """Run every check, print a report and exit with a status code.

    Prints a timestamped header, then either the list of detected issues
    (exit status 1) or an all-clear line (exit status 0).
    """
    started_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    print(f"🔍 Smart City Monitoring Check - {started_at}")
    print("=" * 50)

    # Checks run in this fixed order; their findings are concatenated.
    findings = [
        *check_containers(),
        *check_endpoints(),
        *check_network(),
        *check_resources(),
    ]

    if not findings:
        print("✅ All systems operational")
        sys.exit(0)

    print(f"⚠️ ALERT: {len(findings)} issue(s) detected!")
    for finding in findings:
        print(f" - {finding}")
    # Per the original note, this output is captured by the Hermes cron job
    # and sent to Telegram; the non-zero exit status signals that issues exist.
    sys.exit(1)
|
|
|
|
# Run the checks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()