Session 2026-05-06: QuantumLeap+CrateDB, Telegraf debug, MapStore GeoServer fix
This commit is contained in:
111
scripts/smartcity_monitor.py
Executable file
111
scripts/smartcity_monitor.py
Executable file
@@ -0,0 +1,111 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Smart City Digital Twin Martinique - Monitoring Script
|
||||
Hybrid mode: Periodic checks + webhook-ready output
|
||||
Alerts via Telegram when issues detected
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
# Configuration

# Container names that check_containers() expects to find in `docker ps`.
CRITICAL_CONTAINERS = [
    "openremote_manager_1",
    "openremote_keycloak_1",
    "smart-city-simulator",
    "emqx_emqx_1",
    "mainfluxlabs-broker",
    "stellio-api-gateway",
    "smart-city-influxdb",
    "smart-city-grafana",
    "traefik",
    "smart-city-prometheus-brokers",
]

# (display name, URL) pairs probed over HTTP by check_endpoints().
ENDPOINTS = [
    ("OpenRemote", "https://openremote.digitribe.fr"),
    ("Grafana", "https://grafana.digitribe.fr"),
    ("Orion-LD", "http://fiware-gis-quickstart-orion-1:1026/version"),
    ("Stellio", "https://stellio.digitribe.fr"),
    ("FROST", "http://frost_http-web-1:8080/FROST-Server/core/v1.0/info"),
]

# NOTE(review): not referenced by any check in this file; presumably the name
# of the shared Docker network -- confirm before relying on it.
NETWORK = "smartcity-shared"

TELEGRAM_USER = "@ericf972"  # Will be used by Hermes send_message
|
||||
|
||||
def run_cmd(cmd):
    """Execute *cmd* through the shell and report how it went.

    Returns a ``(stdout, stderr, returncode)`` tuple; both streams are
    decoded as text and stripped of surrounding whitespace. If running the
    command raises (for example when the 10-second timeout expires), the
    result is ``("", str(exception), 1)`` instead.
    """
    try:
        completed = subprocess.run(
            cmd,
            shell=True,  # callers pass pipelines and redirections
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception as exc:
        # Best effort: a failing probe must not abort the monitoring run.
        return "", str(exc), 1
    else:
        stdout_text = completed.stdout.strip()
        stderr_text = completed.stderr.strip()
        return stdout_text, stderr_text, completed.returncode
|
||||
|
||||
def check_containers():
    """Check that every critical container is currently running.

    Lists the running containers with a single ``docker ps`` call and
    compares the reported names *exactly* against ``CRITICAL_CONTAINERS``.
    An exact comparison matters: a word/regex match such as ``grep -w``
    would accept ``traefik-something`` as proof that ``traefik`` is up.

    Returns:
        A list of alert strings, one per critical container that is absent
        from the ``docker ps`` output; empty when all are present. If the
        ``docker ps`` call yields no output (e.g. it failed), every critical
        container is reported as down.
    """
    # One docker invocation for all containers instead of one per container.
    out, _err, _code = run_cmd("docker ps --format '{{.Names}}'")
    running = {line.strip() for line in out.splitlines() if line.strip()}

    return [
        f"🛑 Container DOWN: {container}"
        for container in CRITICAL_CONTAINERS
        if container not in running
    ]
|
||||
|
||||
def check_endpoints():
    """Check that each configured endpoint answers over HTTP.

    Every ``(name, url)`` pair in ``ENDPOINTS`` is probed with ``curl``
    (silent, TLS verification disabled via ``-k``, 5-second connect
    timeout). An endpoint is considered up only when the command exits with
    status 0 and the printed HTTP status code is 200, 301 or 302.

    Returns:
        A list of alert strings, one per endpoint considered down; empty
        when all endpoints respond acceptably. The code shown in the alert
        is whatever the command printed, which may be empty if it produced
        no output.
    """
    # Local import so this function does not depend on a file-level import.
    import shlex

    accepted_codes = ("200", "301", "302")
    issues = []
    for name, url in ENDPOINTS:
        # Quote the URL: characters such as '&', '?' or ';' would otherwise
        # be interpreted by the shell and change or break the command.
        cmd = (
            "curl -k -s -o /dev/null -w '%{http_code}' "
            f"--connect-timeout 5 {shlex.quote(url)}"
        )
        out, _err, code = run_cmd(cmd)
        if code != 0 or out not in accepted_codes:
            issues.append(f"🌐 Endpoint DOWN: {name} ({url}) - HTTP {out}")
    return issues
|
||||
|
||||
def check_network():
    """Verify container-to-container connectivity.

    Runs ``wget --spider`` inside the ``traefik`` container against
    ``http://openremote_manager_1:8080``; a non-zero exit status of that
    command is reported as a network issue.

    Returns:
        A list holding a single alert string when the probe fails, otherwise
        an empty list.
    """
    # Check if Traefik can reach OpenRemote
    probe = "docker exec traefik wget -q --spider http://openremote_manager_1:8080 2>&1"
    _stdout, _stderr, status = run_cmd(probe)

    if status == 0:
        return []
    return ["🔌 Network issue: Traefik → OpenRemote"]
|
||||
|
||||
def _read_percent(cmd):
    """Run *cmd* and return its output parsed as an integer, or ``None``.

    ``None`` is returned when the command prints nothing or something that
    is not an integer, so that an unexpected tool output cannot crash the
    monitoring run with a ``ValueError``.
    """
    out, _err, _code = run_cmd(cmd)
    try:
        return int(out)
    except ValueError:
        return None


def check_resources(disk_threshold=80, mem_threshold=90):
    """Check disk and memory usage of the host.

    Args:
        disk_threshold: Usage percentage of ``/`` above which a disk alert
            is raised (default 80).
        mem_threshold: Memory usage percentage above which a memory alert
            is raised (default 90).

    Returns:
        A list of alert strings for every resource above its threshold;
        empty when usage is within limits. A measurement that cannot be
        read or parsed is skipped rather than reported.
    """
    issues = []

    # Disk space. `-P` forces the POSIX one-line-per-filesystem layout, so the
    # value is always on line 2 even for long device names; LC_ALL=C keeps the
    # output independent of the system locale.
    disk_used = _read_percent(
        "LC_ALL=C df -P / | awk 'NR==2 {print $5}' | tr -d '%'"
    )
    if disk_used is not None and disk_used > disk_threshold:
        issues.append(f"💾 Disk space critical: {disk_used}% used")

    # Memory. LC_ALL=C because the awk pattern relies on the English "Mem:"
    # row label, which `free` may translate under other locales.
    mem_used = _read_percent(
        "LC_ALL=C free | awk '/Mem:/ {print int($3/$2 * 100)}'"
    )
    if mem_used is not None and mem_used > mem_threshold:
        issues.append(f"🧠 Memory critical: {mem_used}% used")

    return issues
|
||||
|
||||
def main():
    """Run every check, print a report and exit with a status code.

    Prints a timestamped header, then collects the alerts produced by the
    container, endpoint, network and resource checks (in that order).
    Exits with status 0 when no alert was produced, otherwise prints each
    alert and exits with status 1.
    """
    started_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"🔍 Smart City Monitoring Check - {started_at}")
    print("=" * 50)

    # Run all checks
    all_issues = []
    for check in (check_containers, check_endpoints, check_network, check_resources):
        all_issues += check()

    # Output results
    if not all_issues:
        print("✅ All systems operational")
        sys.exit(0)

    print(f"⚠️ ALERT: {len(all_issues)} issue(s) detected!")
    for issue in all_issues:
        print(f" - {issue}")
    # This output will be captured by Hermes cron job and sent to Telegram
    sys.exit(1)  # Non-zero exit code indicates issues


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user