Files
smart-city-digital-twin-mar…/scripts/smartcity_monitor.py

112 lines
3.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Smart City Digital Twin Martinique - Monitoring Script
Hybrid mode: Periodic checks + webhook-ready output
Alerts via Telegram when issues detected
"""
import json
import shlex
import subprocess
import sys
from datetime import datetime
# Configuration

# Names of the containers that check_containers() expects to find running.
CRITICAL_CONTAINERS = [
    "openremote_manager_1",
    "openremote_keycloak_1",
    "smart-city-simulator",
    "emqx_emqx_1",
    "mainfluxlabs-broker",
    "stellio-api-gateway",
    "smart-city-influxdb",
    "smart-city-grafana",
    "traefik",
    "smart-city-prometheus-brokers",
]

# (display name, URL) pairs probed over HTTP by check_endpoints().
ENDPOINTS = [
    ("OpenRemote", "https://openremote.digitribe.fr"),
    ("Grafana", "https://grafana.digitribe.fr"),
    ("Orion-LD", "http://fiware-gis-quickstart-orion-1:1026/version"),
    ("Stellio", "https://stellio.digitribe.fr"),
    ("FROST", "http://frost_http-web-1:8080/FROST-Server/core/v1.0/info"),
]

# NOTE(review): not referenced by any check visible in this file;
# presumably the shared Docker network name - confirm before relying on it.
NETWORK = "smartcity-shared"

TELEGRAM_USER = "@ericf972"  # Will be used by Hermes send_message
def run_cmd(cmd, timeout=10):
    """Run a command and return ``(stdout, stderr, returncode)``.

    Args:
        cmd: Either a shell command line (``str``), which is executed
            through the shell so pipes and redirections work, or a
            sequence of arguments, which is executed directly without a
            shell.
        timeout: Maximum number of seconds to wait for the command.

    Returns:
        A 3-tuple of stripped stdout, stripped stderr and the exit status.
        If the command cannot be started, times out, or its output cannot
        be decoded, the tuple is ``("", <error message>, 1)`` instead of
        an exception being raised.
    """
    # Only plain strings need the shell; argv sequences bypass it entirely,
    # which avoids any quoting problems for callers that do not need pipes.
    use_shell = isinstance(cmd, str)
    try:
        result = subprocess.run(
            cmd,
            shell=use_shell,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except (subprocess.SubprocessError, OSError, ValueError) as exc:
        # SubprocessError covers TimeoutExpired, OSError covers a missing or
        # non-executable binary, ValueError covers undecodable output.
        return "", str(exc), 1
    return result.stdout.strip(), result.stderr.strip(), result.returncode
def check_containers():
    """Check that every container in CRITICAL_CONTAINERS is running.

    The names of the running containers are listed once with ``docker ps``
    and compared by exact equality, so a container is only considered up
    when a running container carries precisely that name (a substring or
    word match such as ``traefik`` inside ``traefik-backup`` does not
    count).

    Returns:
        A list with one alert string per critical container that is not
        running. If ``docker ps`` itself fails or prints nothing, every
        critical container is reported as down.
    """
    # Single docker invocation instead of one per container.
    out, _err, _code = run_cmd("docker ps --format '{{.Names}}'")
    running = {line.strip() for line in out.splitlines() if line.strip()}
    return [
        f"🛑 Container DOWN: {container}"
        for container in CRITICAL_CONTAINERS
        if container not in running
    ]
def check_endpoints():
    """Check that every URL in ENDPOINTS answers over HTTP.

    Each URL is requested with ``curl`` and only the HTTP status code is
    kept. An endpoint is healthy when curl exits with status 0 and the
    HTTP status is 200, 301 or 302.

    Returns:
        A list with one alert string per endpoint that failed the probe.
    """
    healthy_statuses = {"200", "301", "302"}
    issues = []
    for name, url in ENDPOINTS:
        # The URL is shell-quoted because the command goes through the
        # shell: characters such as '&', '?' or ';' in a URL would
        # otherwise be interpreted by the shell rather than passed to curl.
        # -k turns off TLS certificate verification, so this is a liveness
        # probe only, not a certificate check.
        cmd = (
            "curl -k -s -o /dev/null -w '%{http_code}' "
            f"--connect-timeout 5 {shlex.quote(url)}"
        )
        out, _err, code = run_cmd(cmd)
        if code != 0 or out not in healthy_statuses:
            # No output at all means curl never reported a status
            # (for example the command timed out).
            status = out or "no response"
            issues.append(f"🌐 Endpoint DOWN: {name} ({url}) - HTTP {status}")
    return issues
def check_network():
    """Probe container-to-container connectivity.

    Runs ``wget --spider`` inside the ``traefik`` container against the
    OpenRemote manager on port 8080 and treats any non-zero exit status
    as a failure.

    Returns:
        A one-element list holding the alert string when the probe fails,
        otherwise an empty list.
    """
    probe = "docker exec traefik wget -q --spider http://openremote_manager_1:8080 2>&1"
    _stdout, _stderr, status = run_cmd(probe)
    return ["🔌 Network issue: Traefik → OpenRemote"] if status != 0 else []
def check_resources():
    """Check disk and memory usage of the host against fixed thresholds.

    Disk usage of ``/`` is read from ``df`` and alerts above 80 %.
    Memory usage is computed from ``free`` (used / total) and alerts
    above 90 %.

    Returns:
        A list of alert strings, one per exceeded threshold. A probe whose
        output is empty or not an integer is skipped rather than raising.
    """
    # (shell command printing an integer percentage, alert threshold, label)
    probes = (
        # -P forces the POSIX single-line-per-filesystem layout, so the
        # usage column is always field 5 of the second line.
        ("df -P / | awk 'NR==2 {print $5}' | tr -d '%'", 80, "💾 Disk space critical"),
        ("free | awk '/Mem:/ {print int($3/$2 * 100)}'", 90, "🧠 Memory critical"),
    )
    issues = []
    for cmd, threshold, label in probes:
        out, _err, _code = run_cmd(cmd)
        try:
            used = int(out)
        except ValueError:
            # Empty or non-numeric output: the probe could not be evaluated,
            # which must not crash the whole monitoring run.
            continue
        if used > threshold:
            issues.append(f"{label}: {out}% used")
    return issues
def main():
    """Run every check, print a report and exit with a status code.

    Prints a timestamped header, runs the container, endpoint, network
    and resource checks in that order, then prints either the list of
    detected issues or an all-clear line. The process exits with status 1
    when at least one issue was found and 0 otherwise.
    """
    started_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"🔍 Smart City Monitoring Check - {started_at}")
    print("=" * 50)

    # Run all checks, in the same order as they are reported.
    found = []
    for check in (check_containers, check_endpoints, check_network, check_resources):
        found.extend(check())

    if not found:
        print("✅ All systems operational")
        sys.exit(0)

    print(f"⚠️ ALERT: {len(found)} issue(s) detected!")
    for problem in found:
        print(f" - {problem}")
    # This output will be captured by Hermes cron job and sent to Telegram
    sys.exit(1)  # Non-zero exit code indicates issues


if __name__ == "__main__":
    main()