Summary of changes: - JupyterHub: fix DB path (absolute), Dockerfile cleanup, SimpleLocalProcessSpawner - JupyterHub: user eric created as admin - Hermes Dashboard WebUI + TUI chat service (systemd, localhost:9119, auto-boot) - OR mbtiles: generated Martinique PNG tiles (5690 tiles, 10.9MB) — needs PBF for OR - OR mbtiles: restored original PBF with corrected metadata (world bounds, Martinique center) - OR mapsettings: verified center=[-61,14.5], bounds=Martinique, minZoom=0 - Trino: added node.properties (node.environment=production) — needs restart - TODO.md: updated with current state - session_resume_consolide.md: created (per-session summary)
112 lines
3.6 KiB
Python
Executable File
112 lines
3.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Smart City Digital Twin Martinique - Monitoring Script
|
|
Hybrid mode: Periodic checks + webhook-ready output
|
|
Alerts via Telegram when issues detected
|
|
"""
|
|
|
|
import json
import shlex
import subprocess
import sys
from datetime import datetime
|
|
|
|
# Configuration

# Container names that check_containers() expects to find in `docker ps`.
CRITICAL_CONTAINERS = [
    "openremote-manager",
    "openremote-keycloak",
    "smart-city-simulator",
    "emqx_emqx_1",
    "mainfluxlabs-broker",
    "stellio-api-gateway",
    "smart-city-influxdb",
    "smart-city-grafana",
    "traefik",
    "smart-city-prometheus-brokers",
]

# (display name, URL) pairs probed with curl by check_endpoints().
ENDPOINTS = [
    ("OpenRemote", "https://openremote.digitribe.fr"),
    ("Grafana", "https://grafana.digitribe.fr"),
    ("Orion-LD", "http://fiware-gis-quickstart-orion-1:1026/version"),
    ("Stellio", "https://stellio.digitribe.fr"),
    ("FROST", "http://frost_http-web-1:8080/FROST-Server/core/v1.0/info"),
]

# NOTE(review): NETWORK and TELEGRAM_USER are not referenced by any function
# visible in this file; presumably they are consumed elsewhere - confirm.
NETWORK = "smartcity-shared"
TELEGRAM_USER = "@ericf972"  # Will be used by Hermes send_message
|
|
|
|
def run_cmd(cmd, timeout=10):
    """Run a command and return ``(stdout, stderr, returncode)``.

    Args:
        cmd: Command to execute. A string is run through the shell, so
            pipelines and redirections work. A list or tuple is executed
            directly as an argv vector, without a shell.
        timeout: Seconds to wait before the command is abandoned.

    Returns:
        A 3-tuple of stripped stdout, stripped stderr and the exit status.
        When the command cannot be started or exceeds ``timeout``, the
        result is ``("", <error text>, 1)`` instead of an exception, so a
        single broken probe never aborts the monitoring run.
    """
    try:
        result = subprocess.run(
            cmd,
            # Only strings need the shell; argv lists bypass it entirely.
            shell=isinstance(cmd, str),
            capture_output=True,
            text=True,
            # Undecodable bytes in the output must not turn a successful
            # command into a reported failure.
            errors="replace",
            timeout=timeout,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: command could not be started; SubprocessError covers
        # TimeoutExpired; ValueError: invalid arguments to Popen.
        return "", str(e), 1
    return result.stdout.strip(), result.stderr.strip(), result.returncode
|
|
|
|
def check_containers():
    """Report every critical container that is not currently running.

    Returns:
        A list with one "Container DOWN" message for each entry of
        ``CRITICAL_CONTAINERS`` whose exact name is missing from the
        ``docker ps`` name listing. If the listing comes back empty (for
        example because ``docker`` itself failed), every critical container
        is reported as down.
    """
    # List the running containers once instead of once per container.
    out, _err, _code = run_cmd("docker ps --format '{{.Names}}'")

    # Compare whole names: a word-based match would let "traefik" be
    # satisfied by a container called "traefik-foo" and hide an outage.
    running = {line.strip() for line in out.splitlines()}

    return [
        f"🛑 Container DOWN: {container}"
        for container in CRITICAL_CONTAINERS
        if container not in running
    ]
|
|
|
|
def check_endpoints():
    """Probe each configured endpoint with curl and report the failures.

    An endpoint counts as up only when curl exits with status 0 and the
    reported HTTP status is 200, 301 or 302.

    Returns:
        A list with one "Endpoint DOWN" message per failing entry of
        ``ENDPOINTS``; empty when every endpoint answered acceptably.
    """
    accepted = {"200", "301", "302"}
    issues = []
    for name, url in ENDPOINTS:
        cmd = (
            # -k: skip TLS certificate verification; -w prints only the
            # HTTP status code because the body goes to /dev/null.
            "curl -k -s -o /dev/null -w '%{http_code}' "
            # --max-time stays below run_cmd's 10 s default so curl reports
            # the failure itself instead of being killed mid-transfer.
            "--connect-timeout 5 --max-time 9 "
            # Quote the URL so characters such as '&' or '?' are not
            # interpreted by the shell.
            f"{shlex.quote(url)}"
        )
        out, _err, code = run_cmd(cmd)
        if code != 0 or out not in accepted:
            issues.append(f"🌐 Endpoint DOWN: {name} ({url}) - HTTP {out}")
    return issues
|
|
|
|
def check_network():
    """Verify that the Traefik container can reach OpenRemote over HTTP.

    Returns:
        A one-element list holding the network-issue message when the probe
        command exits with a non-zero status, otherwise an empty list.
    """
    # Runs wget in spider mode (no download) from inside the traefik container.
    # NOTE(review): the probed host is "openremote_manager_1" while
    # CRITICAL_CONTAINERS lists "openremote-manager" - confirm the hostname.
    probe = "docker exec traefik wget -q --spider http://openremote_manager_1:8080 2>&1"
    _out, _err, status = run_cmd(probe)
    return [] if status == 0 else ["🔌 Network issue: Traefik → OpenRemote"]
|
|
|
|
def check_resources(disk_threshold=80, memory_threshold=90):
    """Check root-filesystem usage and memory usage against thresholds.

    Args:
        disk_threshold: Percentage of ``/`` in use above which an issue is
            reported.
        memory_threshold: Percentage of memory in use above which an issue
            is reported.

    Returns:
        A list of issue messages. A probe that prints nothing is skipped.
        A probe that prints something that is not an integer is reported as
        a failed check instead of raising.
    """
    # (message label, shell probe printing an integer percentage, limit)
    probes = (
        (
            "💾 Disk space",
            "df -h / | awk 'NR==2 {print $5}' | tr -d '%'",
            disk_threshold,
        ),
        (
            "🧠 Memory",
            "free | awk '/Mem:/ {print int($3/$2 * 100)}'",
            memory_threshold,
        ),
    )

    issues = []
    for label, cmd, limit in probes:
        out, _err, _code = run_cmd(cmd)
        if not out:
            # No reading available; nothing to compare against the limit.
            continue
        try:
            used = int(out)
        except ValueError:
            # Unexpected output must not abort the whole monitoring run.
            issues.append(f"{label} check failed: unexpected output {out!r}")
            continue
        if used > limit:
            issues.append(f"{label} critical: {out}% used")
    return issues
|
|
|
|
def main():
    """Run every health check, print a report and exit with a status code.

    The process exits with status 1 when at least one issue was found and
    with status 0 otherwise. A check that raises is recorded as an issue so
    the results of the other checks are still reported.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    print(f"🔍 Smart City Monitoring Check - {timestamp}")
    print("=" * 50)

    # Run all checks
    all_issues = []
    checks = (check_containers, check_endpoints, check_network, check_resources)
    for check in checks:
        try:
            all_issues.extend(check())
        except Exception as exc:
            # Top-level boundary: one crashing check must not discard the
            # issues already collected by the others.
            all_issues.append(f"💥 Check crashed: {check.__name__} ({exc})")

    # Output results
    if not all_issues:
        print("✅ All systems operational")
        sys.exit(0)

    print(f"⚠️ ALERT: {len(all_issues)} issue(s) detected!")
    for issue in all_issues:
        print(f" - {issue}")
    # This output will be captured by Hermes cron job and sent to Telegram
    sys.exit(1)  # Non-zero exit code indicates issues
|
|
|
|
if __name__ == "__main__":
|
|
main()
|