fix: telegraf topics, mqtt brokers, docker-compose fixes
- Fix MOSQUITTO_HOST (wrong container name)
- Fix EMQX_PORT (1885 external -> 1883 internal)
- Fix telegraf MQTT topics (city/sensors/#)
- Fix BunkerM dynsec JSON
- Add kepler.yml Traefik config
- Update monitoring script
This commit is contained in:
@@ -3,6 +3,15 @@
|
||||
Smart City Digital Twin Martinique - Monitoring Script
|
||||
Hybrid mode: Periodic checks + webhook-ready output
|
||||
Alerts via Telegram when issues detected
|
||||
|
||||
Current stack (as of 2026-06-05):
|
||||
- Analytics: Trino, StarRocks FE/BE, ClickHouse, Delta Lake, DuckDB, Streamlit
|
||||
- FlexMeasures: Server, Worker, DB, Redis
|
||||
- Airflow: Scheduler, Webserver, Postgres
|
||||
- SmartApp: Web, API
|
||||
- Gitea: Server, Runner
|
||||
- Traefik: Reverse proxy
|
||||
- Kepler: Geospatial visualization
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
@@ -10,23 +19,52 @@ import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
# Configuration
|
||||
# Configuration - CURRENT RUNNING STACK
|
||||
CRITICAL_CONTAINERS = [
|
||||
"openremote-manager", "openremote-keycloak", "smart-city-simulator",
|
||||
"emqx_emqx_1", "mainfluxlabs-broker", "stellio-api-gateway",
|
||||
"smart-city-influxdb", "smart-city-grafana", "traefik",
|
||||
"smart-city-prometheus-brokers"
|
||||
# Analytics stack
|
||||
"trino", "starrocks-fe", "starrocks-be", "clickhouse",
|
||||
"delta-lake", "duckdb", "streamlit", "trino-nginx",
|
||||
# FlexMeasures stack
|
||||
"flexmeasures-server", "flexmeasures-worker", "flexmeasures-db", "flexmeasures-redis",
|
||||
# Airflow stack
|
||||
"airflow-scheduler", "airflow-webserver", "airflow-postgres",
|
||||
# SmartApp
|
||||
"smartapp-web", "smartapp-api",
|
||||
# Gitea
|
||||
"gitea", "gitea-runner",
|
||||
# Infrastructure
|
||||
"traefik",
|
||||
"smart-city-kepler",
|
||||
]
|
||||
|
||||
ENDPOINTS = [
|
||||
("OpenRemote", "https://openremote.digitribe.fr"),
|
||||
("Grafana", "https://grafana.digitribe.fr"),
|
||||
("Orion-LD", "http://fiware-gis-quickstart-orion-1:1026/version"),
|
||||
("Stellio", "https://stellio.digitribe.fr"),
|
||||
("FROST", "http://frost_http-web-1:8080/FROST-Server/core/v1.0/info")
|
||||
# SmartApp
|
||||
("SmartApp Web", "https://smartapp.digitribe.fr"),
|
||||
("SmartApp API", "https://api-smartapp.digitribe.fr/health"),
|
||||
# Analytics
|
||||
("Trino", "https://trino.digitribe.fr"),
|
||||
("Streamlit", "https://streamlit.digitribe.fr"),
|
||||
("ClickHouse", "https://clickhouse.digitribe.fr"),
|
||||
("StarRocks", "https://starrocks.digitribe.fr"),
|
||||
("DuckDB", "https://duckdb.digitribe.fr"),
|
||||
("Delta Lake", "https://deltalake.digitribe.fr"),
|
||||
# FlexMeasures
|
||||
("FlexMeasures", "https://flexmeasures.digitribe.fr"),
|
||||
# Airflow
|
||||
("Airflow", "https://airflow.digitribe.fr"),
|
||||
# Gitea
|
||||
("Gitea", "https://gitea.digitribe.fr"),
|
||||
# Kepler
|
||||
("Kepler", "https://kepler.digitribe.fr"),
|
||||
]
|
||||
|
||||
NETWORK = "smartcity-shared"
|
||||
# Endpoints known to have issues (documented)
|
||||
KNOWN_ISSUES = {
|
||||
"https://trino.digitribe.fr": "200/302 - Trino UI accessible at /ui/ (redirects to login)",
|
||||
"https://kepler.digitribe.fr": "404 - no Traefik route configured for Kepler",
|
||||
"https://starrocks.digitribe.fr": "502 - StarRocks FE HTTP port 8030 not ready (FE still starting up)",
|
||||
}
|
||||
|
||||
TELEGRAM_USER = "@ericf972" # Will be used by Hermes send_message
|
||||
|
||||
def run_cmd(cmd):
|
||||
@@ -53,18 +91,32 @@ def check_endpoints():
|
||||
for name, url in ENDPOINTS:
|
||||
cmd = f"curl -k -s -o /dev/null -w '%{{http_code}}' --connect-timeout 5 {url}"
|
||||
out, err, code = run_cmd(cmd)
|
||||
if code != 0 or out not in ["200", "301", "302"]:
|
||||
# Check if this is a known issue
|
||||
if url in KNOWN_ISSUES:
|
||||
issues.append(f"⚠️ Known issue: {name} ({url}) - HTTP {out} - {KNOWN_ISSUES[url]}")
|
||||
if code != 0 or out not in ["200", "301", "302", "303"]:
|
||||
issues.append(f"🌐 Endpoint DOWN: {name} ({url}) - HTTP {out}")
|
||||
return issues
|
||||
|
||||
def check_network():
|
||||
"""Check network connectivity between containers"""
|
||||
issues = []
|
||||
# Check if Traefik can reach OpenRemote
|
||||
cmd = "docker exec traefik wget -q --spider http://openremote_manager_1:8080 2>&1"
|
||||
out, err, code = run_cmd(cmd)
|
||||
if code != 0:
|
||||
issues.append(f"🔌 Network issue: Traefik → OpenRemote")
|
||||
# Check if Traefik can reach key services
|
||||
services = [
|
||||
("trino", "trino:8080"),
|
||||
("streamlit", "streamlit:8501"),
|
||||
("clickhouse", "clickhouse:8123"),
|
||||
("starrocks-fe", "starrocks-fe:8030"),
|
||||
("flexmeasures-server", "flexmeasures-server:5000"),
|
||||
("airflow-webserver", "airflow-webserver:8080"),
|
||||
("smartapp-web", "smartapp-web:80"),
|
||||
("gitea", "gitea:3000"),
|
||||
]
|
||||
for name, target in services:
|
||||
cmd = f"docker exec traefik wget -q --spider http://{target} 2>&1"
|
||||
out, err, code = run_cmd(cmd)
|
||||
if code != 0:
|
||||
issues.append(f"🔌 Network issue: Traefik → {name} ({target})")
|
||||
return issues
|
||||
|
||||
def check_resources():
|
||||
@@ -86,16 +138,16 @@ def main():
|
||||
"""Main monitoring function"""
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
all_issues = []
|
||||
|
||||
|
||||
print(f"🔍 Smart City Monitoring Check - {timestamp}")
|
||||
print("=" * 50)
|
||||
|
||||
|
||||
# Run all checks
|
||||
all_issues.extend(check_containers())
|
||||
all_issues.extend(check_endpoints())
|
||||
all_issues.extend(check_network())
|
||||
all_issues.extend(check_resources())
|
||||
|
||||
|
||||
# Output results
|
||||
if all_issues:
|
||||
print(f"⚠️ ALERT: {len(all_issues)} issue(s) detected!")
|
||||
@@ -108,4 +160,4 @@ def main():
|
||||
sys.exit(0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
main()
|
||||
Reference in New Issue
Block a user