fix: telegraf topics, mqtt brokers, docker-compose fixes

- Fix MOSQUITTO_HOST (wrong container name)
- Fix EMQX_PORT (1885 external -> 1883 internal)
- Fix telegraf MQTT topics (city/sensors/#)
- Fix BunkerM dynsec JSON
- Add kepler.yml Traefik config
- Update monitoring script
This commit is contained in:
Eric FELIXINE
2026-06-07 20:18:41 -04:00
parent 7c0cb330d9
commit 83779cf5d7
9 changed files with 135 additions and 69 deletions

View File

@@ -3,6 +3,15 @@
Smart City Digital Twin Martinique - Monitoring Script
Hybrid mode: Periodic checks + webhook-ready output
Alerts via Telegram when issues detected
Current stack (as of 2026-06-05):
- Analytics: Trino, StarRocks FE/BE, ClickHouse, Delta Lake, DuckDB, Streamlit
- FlexMeasures: Server, Worker, DB, Redis
- Airflow: Scheduler, Webserver, Postgres
- SmartApp: Web, API
- Gitea: Server, Runner
- Traefik: Reverse proxy
- Kepler: Geospatial visualization
"""
import subprocess
@@ -10,23 +19,52 @@ import json
import sys
from datetime import datetime
# Configuration
# Configuration - CURRENT RUNNING STACK
# Names of the containers this script treats as critical, grouped by stack.
# NOTE(review): presumably consumed by check_containers() (its body is not
# visible here) - confirm.
# NOTE(review): this listing looks like a flattened diff. The first four
# entry lines (OpenRemote, EMQX, Mainflux, Stellio, InfluxDB, Grafana, ...)
# appear to be the pre-change list, and the line ending in
# "smart-city-prometheus-brokers" has no trailing comma. If both groups were
# really live, Python would silently concatenate that string with "trino".
# Confirm which entries belong to the current file.
CRITICAL_CONTAINERS = [
"openremote-manager", "openremote-keycloak", "smart-city-simulator",
"emqx_emqx_1", "mainfluxlabs-broker", "stellio-api-gateway",
"smart-city-influxdb", "smart-city-grafana", "traefik",
"smart-city-prometheus-brokers"
# Analytics stack
"trino", "starrocks-fe", "starrocks-be", "clickhouse",
"delta-lake", "duckdb", "streamlit", "trino-nginx",
# FlexMeasures stack
"flexmeasures-server", "flexmeasures-worker", "flexmeasures-db", "flexmeasures-redis",
# Airflow stack
"airflow-scheduler", "airflow-webserver", "airflow-postgres",
# SmartApp
"smartapp-web", "smartapp-api",
# Gitea
"gitea", "gitea-runner",
# Infrastructure
"traefik",
"smart-city-kepler",
]
# (display name, URL) pairs. check_endpoints() iterates this list and requests
# each URL with curl, comparing the returned HTTP status code against a small
# set of accepted codes.
# NOTE(review): this listing looks like a flattened diff. The first five
# tuples (OpenRemote, Grafana, Orion-LD, Stellio, FROST) appear to be the
# pre-change entries, and the FROST tuple has no trailing comma. If it were
# live code, the next tuple would be parsed as a call on it and raise
# TypeError. Confirm which entries belong to the current file.
ENDPOINTS = [
("OpenRemote", "https://openremote.digitribe.fr"),
("Grafana", "https://grafana.digitribe.fr"),
("Orion-LD", "http://fiware-gis-quickstart-orion-1:1026/version"),
("Stellio", "https://stellio.digitribe.fr"),
("FROST", "http://frost_http-web-1:8080/FROST-Server/core/v1.0/info")
# SmartApp
("SmartApp Web", "https://smartapp.digitribe.fr"),
("SmartApp API", "https://api-smartapp.digitribe.fr/health"),
# Analytics
("Trino", "https://trino.digitribe.fr"),
("Streamlit", "https://streamlit.digitribe.fr"),
("ClickHouse", "https://clickhouse.digitribe.fr"),
("StarRocks", "https://starrocks.digitribe.fr"),
("DuckDB", "https://duckdb.digitribe.fr"),
("Delta Lake", "https://deltalake.digitribe.fr"),
# FlexMeasures
("FlexMeasures", "https://flexmeasures.digitribe.fr"),
# Airflow
("Airflow", "https://airflow.digitribe.fr"),
# Gitea
("Gitea", "https://gitea.digitribe.fr"),
# Kepler
("Kepler", "https://kepler.digitribe.fr"),
]
# Name of the Docker network.
# NOTE(review): no use of NETWORK is visible in this excerpt - confirm it is
# still referenced.
NETWORK = "smartcity-shared"
# Endpoints known to have issues (documented)
# Maps an endpoint URL (the same string used in ENDPOINTS) to a human-readable
# explanation. check_endpoints() looks each URL up here and, on a match,
# reports it with the "Known issue" prefix and this explanation.
# NOTE(review): the Kepler entry says no Traefik route is configured, while the
# accompanying commit message says a kepler.yml Traefik config was added.
# Verify whether this entry is now stale.
KNOWN_ISSUES = {
"https://trino.digitribe.fr": "200/302 - Trino UI accessible at /ui/ (redirects to login)",
"https://kepler.digitribe.fr": "404 - no Traefik route configured for Kepler",
"https://starrocks.digitribe.fr": "502 - StarRocks FE HTTP port 8030 not ready (FE still starting up)",
}
TELEGRAM_USER = "@ericf972" # Will be used by Hermes send_message
def run_cmd(cmd):
@@ -53,18 +91,32 @@ def check_endpoints():
for name, url in ENDPOINTS:
cmd = f"curl -k -s -o /dev/null -w '%{{http_code}}' --connect-timeout 5 {url}"
out, err, code = run_cmd(cmd)
if code != 0 or out not in ["200", "301", "302"]:
# Check if this is a known issue
if url in KNOWN_ISSUES:
issues.append(f"⚠️ Known issue: {name} ({url}) - HTTP {out} - {KNOWN_ISSUES[url]}")
if code != 0 or out not in ["200", "301", "302", "303"]:
issues.append(f"🌐 Endpoint DOWN: {name} ({url}) - HTTP {out}")
return issues
def check_network():
"""Check that the Traefik container can reach key services over HTTP.

Each probe runs ``wget -q --spider`` inside the ``traefik`` container via
``docker exec`` and passes the command string to ``run_cmd``, whose result
is unpacked as ``(out, err, code)``. A non-zero ``code`` is recorded as a
network issue; ``out`` and ``err`` are not inspected.

Returns:
    A list of issue message strings, empty when every probe succeeds.
"""
issues = []
# NOTE(review): this listing looks like a flattened diff (no +/- markers,
# indentation lost). The single OpenRemote probe below appears to be the
# pre-change code that the services loop replaced. Confirm before treating
# both as live.
# Check if Traefik can reach OpenRemote
cmd = "docker exec traefik wget -q --spider http://openremote_manager_1:8080 2>&1"
out, err, code = run_cmd(cmd)
if code != 0:
issues.append(f"🔌 Network issue: Traefik → OpenRemote")
# Check if Traefik can reach key services
# Each entry is (label used in the alert text, host:port probed from Traefik).
services = [
("trino", "trino:8080"),
("streamlit", "streamlit:8501"),
("clickhouse", "clickhouse:8123"),
("starrocks-fe", "starrocks-fe:8030"),
("flexmeasures-server", "flexmeasures-server:5000"),
("airflow-webserver", "airflow-webserver:8080"),
("smartapp-web", "smartapp-web:80"),
("gitea", "gitea:3000"),
]
for name, target in services:
# --spider makes wget check the URL without downloading the body.
cmd = f"docker exec traefik wget -q --spider http://{target} 2>&1"
out, err, code = run_cmd(cmd)
if code != 0:
issues.append(f"🔌 Network issue: Traefik → {name} ({target})")
return issues
def check_resources():
@@ -86,16 +138,16 @@ def main():
"""Main monitoring function"""
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
all_issues = []
print(f"🔍 Smart City Monitoring Check - {timestamp}")
print("=" * 50)
# Run all checks
all_issues.extend(check_containers())
all_issues.extend(check_endpoints())
all_issues.extend(check_network())
all_issues.extend(check_resources())
# Output results
if all_issues:
print(f"⚠️ ALERT: {len(all_issues)} issue(s) detected!")
@@ -108,4 +160,4 @@ def main():
sys.exit(0)
# Script entry point: run the monitoring checks once, and only when this file
# is executed directly. The call must stay inside the guard so that importing
# the module never triggers the checks.
if __name__ == "__main__":
    main()