#!/usr/bin/env python3
"""
Smart City Digital Twin Martinique - Monitoring Script

Hybrid mode: Periodic checks + webhook-ready output
Alerts via Telegram when issues detected

Current stack (as of 2026-06-05):
- Analytics: Trino, StarRocks FE/BE, ClickHouse, Delta Lake, DuckDB, Streamlit
- FlexMeasures: Server, Worker, DB, Redis
- Airflow: Scheduler, Webserver, Postgres
- SmartApp: Web, API
- Gitea: Server, Runner
- Traefik: Reverse proxy
- Kepler: Geospatial visualization
"""

import subprocess
import json
import sys
from datetime import datetime

# Configuration - CURRENT RUNNING STACK
CRITICAL_CONTAINERS = [
    # Analytics stack
    "trino",
    "starrocks-fe",
    "starrocks-be",
    "clickhouse",
    "delta-lake",
    "duckdb",
    "streamlit",
    "trino-nginx",
    # FlexMeasures stack
    "flexmeasures-server",
    "flexmeasures-worker",
    "flexmeasures-db",
    "flexmeasures-redis",
    # Airflow stack
    "airflow-scheduler",
    "airflow-webserver",
    "airflow-postgres",
    # SmartApp
    "smartapp-web",
    "smartapp-api",
    # Gitea
    "gitea",
    "gitea-runner",
    # Infrastructure
    "traefik",
    "smart-city-kepler",
]

ENDPOINTS = [
    # SmartApp
    ("SmartApp Web", "https://smartapp.digitribe.fr"),
    ("SmartApp API", "https://api-smartapp.digitribe.fr/health"),
    # Analytics
    ("Trino", "https://trino.digitribe.fr"),
    ("Streamlit", "https://streamlit.digitribe.fr"),
    ("ClickHouse", "https://clickhouse.digitribe.fr"),
    ("StarRocks", "https://starrocks.digitribe.fr"),
    ("DuckDB", "https://duckdb.digitribe.fr"),
    ("Delta Lake", "https://deltalake.digitribe.fr"),
    # FlexMeasures
    ("FlexMeasures", "https://flexmeasures.digitribe.fr"),
    # Airflow
    ("Airflow", "https://airflow.digitribe.fr"),
    # Gitea
    ("Gitea", "https://gitea.digitribe.fr"),
    # Kepler
    ("Kepler", "https://kepler.digitribe.fr"),
]

# Endpoints known to have issues (documented)
KNOWN_ISSUES = {
    "https://trino.digitribe.fr": "200/302 - Trino UI accessible at /ui/ (redirects to login)",
    "https://kepler.digitribe.fr": "404 - no Traefik route configured for Kepler",
    "https://starrocks.digitribe.fr": "502 - StarRocks FE HTTP port 8030 not ready (FE still starting up)",
}

TELEGRAM_USER = "@ericf972"  # Will be used by Hermes send_message

# HTTP status codes (as printed by curl's %{http_code}) treated as "endpoint up".
_OK_HTTP_CODES = frozenset({"200", "301", "302", "303"})

# Alert thresholds, in percent used.
_DISK_THRESHOLD_PERCENT = 80
_MEMORY_THRESHOLD_PERCENT = 90


def run_cmd(cmd, timeout=10):
    """Run a shell command and return ``(stdout, stderr, returncode)``.

    Args:
        cmd: Command line, executed through the shell (pipelines allowed).
        timeout: Seconds to wait before the command is abandoned.

    Returns:
        A 3-tuple of stripped stdout, stripped stderr and the exit code.
        If the command cannot be run or times out, returns
        ``("", <exception text>, 1)`` instead of raising.
    """
    try:
        result = subprocess.run(
            cmd, shell=True, capture_output=True, text=True, timeout=timeout
        )
    except Exception as exc:
        # Deliberately broad: a monitoring run must turn any failure to
        # execute a probe into a reportable result, never into a crash.
        return "", str(exc), 1
    return result.stdout.strip(), result.stderr.strip(), result.returncode


def _parse_percent(text):
    """Return *text* as an int, or ``None`` when it is empty or not numeric."""
    try:
        return int(text.strip())
    except ValueError:
        return None


def check_containers(run=None):
    """Check that every container in ``CRITICAL_CONTAINERS`` is running.

    The running container names are fetched with a single ``docker ps`` call
    and compared by exact name, so ``trino`` is not considered up merely
    because ``trino-nginx`` is running (a word-based grep would match both,
    since ``-`` is not a word character).

    Args:
        run: Optional command runner with the same contract as ``run_cmd``;
            defaults to ``run_cmd``.

    Returns:
        A list of issue strings: one per missing container, or a single
        "Docker unavailable" entry when the container list cannot be read.
    """
    runner = run_cmd if run is None else run
    out, err, code = runner("docker ps --format '{{.Names}}'")
    if code != 0:
        # Without a container list we cannot tell which containers are down;
        # report the real cause rather than flagging all of them.
        detail = err or f"exit code {code}"
        return [f"🛑 Docker unavailable: {detail}"]

    running = {line.strip() for line in out.splitlines() if line.strip()}
    return [
        f"🛑 Container DOWN: {container}"
        for container in CRITICAL_CONTAINERS
        if container not in running
    ]


def check_endpoints(run=None):
    """Probe every URL in ``ENDPOINTS`` with curl and report problems.

    Each endpoint produces at most one issue line:

    * URLs listed in ``KNOWN_ISSUES`` always produce a "Known issue" line
      carrying the observed HTTP code and the documented explanation. They
      are not additionally reported as DOWN.
    * Any other URL produces an "Endpoint DOWN" line when curl fails or the
      HTTP code is not one of 200/301/302/303.

    Note that because known issues are always listed, the script reports
    issues for as long as ``KNOWN_ISSUES`` contains probed URLs.

    Args:
        run: Optional command runner with the same contract as ``run_cmd``;
            defaults to ``run_cmd``.

    Returns:
        A list of issue strings (empty when nothing is wrong).
    """
    runner = run_cmd if run is None else run
    issues = []
    for name, url in ENDPOINTS:
        # -k: skip TLS verification; -w prints only the HTTP status code.
        cmd = f"curl -k -s -o /dev/null -w '%{{http_code}}' --connect-timeout 5 {url}"
        out, _err, code = runner(cmd)

        known = KNOWN_ISSUES.get(url)
        if known is not None:
            issues.append(f"⚠️ Known issue: {name} ({url}) - HTTP {out} - {known}")
            continue  # already reported; avoid a duplicate DOWN line

        if code != 0 or out not in _OK_HTTP_CODES:
            issues.append(f"🌐 Endpoint DOWN: {name} ({url}) - HTTP {out}")
    return issues


def check_network(run=None):
    """Check that the Traefik container can reach key services over HTTP.

    Runs ``wget --spider`` inside the ``traefik`` container against each
    service's internal host:port.

    Args:
        run: Optional command runner with the same contract as ``run_cmd``;
            defaults to ``run_cmd``.

    Returns:
        A list of issue strings, one per service whose probe exited non-zero.
    """
    runner = run_cmd if run is None else run
    # (display name, host:port as seen from inside the Traefik container)
    services = (
        ("trino", "trino:8080"),
        ("streamlit", "streamlit:8501"),
        ("clickhouse", "clickhouse:8123"),
        ("starrocks-fe", "starrocks-fe:8030"),
        ("flexmeasures-server", "flexmeasures-server:5000"),
        ("airflow-webserver", "airflow-webserver:8080"),
        ("smartapp-web", "smartapp-web:80"),
        ("gitea", "gitea:3000"),
    )
    issues = []
    for name, target in services:
        cmd = f"docker exec traefik wget -q --spider http://{target} 2>&1"
        _out, _err, code = runner(cmd)
        if code != 0:
            issues.append(f"🔌 Network issue: Traefik → {name} ({target})")
    return issues


def check_resources(run=None):
    """Check root-filesystem disk usage and memory usage on the host.

    Disk usage above 80% or memory usage above 90% is reported. A probe whose
    output is empty or not an integer is skipped rather than raising, so one
    unreadable metric cannot abort the whole monitoring run.

    Args:
        run: Optional command runner with the same contract as ``run_cmd``;
            defaults to ``run_cmd``.

    Returns:
        A list of issue strings (empty when both metrics are within limits
        or unreadable).
    """
    runner = run_cmd if run is None else run
    probes = (
        # (shell command printing an integer percentage, threshold, label)
        (
            "df -h / | awk 'NR==2 {print $5}' | tr -d '%'",
            _DISK_THRESHOLD_PERCENT,
            "💾 Disk space critical",
        ),
        (
            "free | awk '/Mem:/ {print int($3/$2 * 100)}'",
            _MEMORY_THRESHOLD_PERCENT,
            "🧠 Memory critical",
        ),
    )
    issues = []
    for cmd, threshold, label in probes:
        out, _err, _code = runner(cmd)
        percent = _parse_percent(out)
        if percent is not None and percent > threshold:
            issues.append(f"{label}: {percent}% used")
    return issues


def main():
    """Run all checks, print a report and exit with a status code.

    Exits with status 1 when at least one issue was found and 0 otherwise.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    print(f"🔍 Smart City Monitoring Check - {timestamp}")
    print("=" * 50)

    # Run all checks
    all_issues = []
    for check in (check_containers, check_endpoints, check_network, check_resources):
        all_issues.extend(check())

    # Output results
    if all_issues:
        print(f"⚠️ ALERT: {len(all_issues)} issue(s) detected!")
        for issue in all_issues:
            print(f" - {issue}")
        # This output will be captured by Hermes cron job and sent to Telegram
        sys.exit(1)  # Non-zero exit code indicates issues
    else:
        print("✅ All systems operational")
        sys.exit(0)


if __name__ == "__main__":
    main()