#!/usr/bin/env python3
"""
Kuhn & Rueß GmbH
Consulting and Development
https://kuhn-ruess.de

Special agent: fetch status RSS/Atom feeds and emit one section line per
configured feed. Each line is a JSON object describing reachability, the
latest item and the detected incident state.

Generalised from the original AWS-only agent: works with the per-service
status feeds published by AWS (https://status.aws.amazon.com/) as well as
Statuspage-style incident-history feeds (e.g. https://status.scrivito.com/
incidents.atom).
"""
import argparse
import html
import json
import re
import sys
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from urllib.error import HTTPError, URLError
from urllib.request import ProxyHandler, Request, build_opener
from xml.etree import ElementTree as ET


# Keywords used to classify the most recent feed entry. Statuspage-style
# feeds embed the lifecycle state in the entry text; AWS service feeds only
# publish an entry while an event is in flight.
#
# classify_state() lowercases "<title> <summary>" and looks each keyword up
# as a plain substring (no word boundaries), so every keyword here must be
# lowercase. The keyword found at the smallest offset decides the state.
#
# ACTIVE_KEYWORDS: phrases that mark an incident as still in progress.
ACTIVE_KEYWORDS = (
    "investigating",
    "identified",
    "monitoring",
    "degraded",
    "degradation",
    "partial outage",
    "major outage",
    "service disruption",
    "disruption",
    "increased error",
    "elevated error",
    "in progress",
    "ongoing",
)
# RESOLVED_KEYWORDS: phrases that mark an incident as over.
RESOLVED_KEYWORDS = (
    "resolved",
    "operational",
    "completed",
    "back to normal",
    "fully restored",
    "all systems",
)


def parse_arguments():
    """Parse the agent's command line (``sys.argv``).

    Options:
        --feed NAME=URL   repeatable; collected into a list, at least one
                          is required.
        --timeout         HTTP timeout in seconds as a float (default 15).
        --user-agent      User-Agent header value.
        --proxy URL / --no-proxy
                          mutually exclusive proxy switches.

    Returns:
        The populated ``argparse.Namespace``.

    Exits via ``parser.error`` (usage message, exit status 2) when no
    ``--feed`` was supplied.
    """
    cli = argparse.ArgumentParser(description="Status RSS/Atom feed monitoring")

    cli.add_argument(
        "--feed",
        metavar="NAME=URL",
        action="append",
        default=[],
        help="Feed in the form 'Name=https://...'. May be given multiple times.",
    )
    cli.add_argument(
        "--timeout",
        default=15.0,
        type=float,
        help="HTTP timeout in seconds (default: 15)",
    )
    cli.add_argument(
        "--user-agent",
        default="checkmk-status-feed/1.0",
        help="User-Agent header sent with each request.",
    )

    # --proxy and --no-proxy contradict each other, so argparse rejects the
    # combination for us.
    proxy_switches = cli.add_mutually_exclusive_group()
    proxy_switches.add_argument(
        "--proxy",
        metavar="URL",
        help=(
            "Route all feed requests through this HTTP proxy "
            "(e.g. http://proxy.example:3128)."
        ),
    )
    proxy_switches.add_argument(
        "--no-proxy",
        action="store_true",
        help="Connect directly, ignoring any proxy set in the environment.",
    )

    parsed = cli.parse_args()
    if not parsed.feed:
        cli.error("At least one --feed NAME=URL is required")
    return parsed


def split_feed(spec):
    """Split a ``NAME=URL`` feed specification at its first ``=``.

    Both halves are stripped of surrounding whitespace; any further ``=``
    characters stay part of the URL.

    Returns:
        A ``(name, url)`` tuple of non-empty strings.

    Raises:
        ValueError: if *spec* contains no ``=`` or either half is empty
            after stripping.
    """
    raw_name, separator, raw_url = spec.partition("=")
    if not separator:
        raise ValueError(f"feed spec must be NAME=URL, got: {spec!r}")

    name, url = raw_name.strip(), raw_url.strip()
    if not (name and url):
        raise ValueError(f"empty name or url in feed spec: {spec!r}")
    return name, url


def parse_date(value):
    """Parse a feed timestamp into a timezone-aware ``datetime``.

    Two formats are tried in order: the RFC 2822 style handled by
    ``email.utils.parsedate_to_datetime`` (RSS ``pubDate``), then ISO 8601
    via ``datetime.fromisoformat`` with ``Z`` rewritten to ``+00:00``.

    Returns:
        The parsed ``datetime``; a result without zone information is
        tagged as UTC. ``None`` for empty input or when neither format
        matches.
    """
    if not value:
        return None
    text = value.strip()

    def from_iso(raw):
        return datetime.fromisoformat(raw.replace("Z", "+00:00"))

    # Each parser is paired with the exception types that mean "this is not
    # my format" rather than a genuine failure.
    attempts = (
        (parsedate_to_datetime, (TypeError, ValueError)),
        (from_iso, (ValueError,)),
    )

    parsed = None
    for parser, not_my_format in attempts:
        try:
            parsed = parser(text)
        except not_my_format:
            continue
        if parsed is not None:
            break

    if parsed is None:
        return None
    if parsed.tzinfo is not None:
        return parsed
    return parsed.replace(tzinfo=timezone.utc)


def _strip_html(text):
    text = re.sub(r"<[^>]+>", " ", text)
    text = (
        text.replace("&nbsp;", " ")
        .replace("&amp;", "&")
        .replace("&lt;", "<")
        .replace("&gt;", ">")
    )
    return re.sub(r"\s+", " ", text).strip()


def classify_state(title, summary):
    """Best-effort lifecycle classification of an incident entry.

    The text searched is ``"<title> <summary>"`` in lower case. Keywords
    from ACTIVE_KEYWORDS and RESOLVED_KEYWORDS are matched as plain
    substrings, and the one that occurs earliest decides the result. Because
    the title is placed in front, a keyword in the title is always found
    before any keyword in the summary.

    Returns:
        ``"resolved"`` if the earliest keyword is a resolved keyword,
        ``"active"`` if it is an active keyword, ``"unknown"`` if no
        keyword occurs at all.
    """
    text = f"{title} {summary}".lower()

    def earliest(keywords):
        # Smallest offset at which any of the keywords occurs, else None.
        offsets = [at for at in map(text.find, keywords) if at >= 0]
        return min(offsets) if offsets else None

    active_at = earliest(ACTIVE_KEYWORDS)
    resolved_at = earliest(RESOLVED_KEYWORDS)

    if active_at is None and resolved_at is None:
        return "unknown"
    if active_at is None:
        return "resolved"
    if resolved_at is None:
        return "active"
    # Statuspage feeds list the newest update first, so the leading keyword
    # wins.
    return "resolved" if resolved_at < active_at else "active"


def extract_items(xml_text):
    """Parse a feed document and return its entries, newest first.

    Supports RSS 2.0 (un-namespaced ``<item>``) and Atom (``<entry>`` in the
    Atom namespace). For Atom the human-readable text is taken from
    ``<summary>``, falling back to ``<content>`` when the summary is missing
    or empty. Text nested in child elements (``type="xhtml"``) is included.

    Args:
        xml_text: The feed document.

    Returns:
        A list of dicts with the string keys ``title``, ``published`` (the
        raw timestamp text) and ``summary`` (plain text). The list is sorted
        by parsed timestamp, newest first; entries whose timestamp cannot be
        parsed sort last and keep their document order.

    Raises:
        xml.etree.ElementTree.ParseError: if *xml_text* is not well-formed.
    """
    atom_uri = "http://www.w3.org/2005/Atom"
    ns = {"atom": atom_uri}

    # NOTE(review): xml.etree is not hardened against maliciously crafted
    # XML (see "XML vulnerabilities" in the Python docs). This presumably
    # only ever sees operator-configured feeds - confirm.
    root = ET.fromstring(xml_text)

    def flattened_text(element):
        # All text inside the element, descendants included. Fragments are
        # joined with a space so adjacent blocks do not run together; the
        # surplus whitespace is collapsed later by _strip_html.
        if element is None:
            return ""
        return " ".join(element.itertext())

    items = [
        {
            "title": (item.findtext("title") or "").strip(),
            "published": (item.findtext("pubDate") or "").strip(),
            "summary": _strip_html(item.findtext("description") or ""),
        }
        for item in root.iter("item")
    ]

    for entry in root.iter(f"{{{atom_uri}}}entry"):
        title_el = entry.find("atom:title", ns)
        # <updated> is preferred over <published>.
        published = (
            entry.findtext("atom:updated", default="", namespaces=ns)
            or entry.findtext("atom:published", default="", namespaces=ns)
        )
        # First non-empty of <summary>, <content>. An empty <summary/> must
        # not hide a populated <content>.
        summary = ""
        for tag in ("atom:summary", "atom:content"):
            summary = _strip_html(flattened_text(entry.find(tag, ns)))
            if summary:
                break
        items.append({
            "title": (title_el.text or "").strip() if title_el is not None else "",
            "published": published.strip(),
            "summary": summary,
        })

    def sort_key(it):
        # Unparseable timestamps get the smallest possible value so they end
        # up last after the reversed sort.
        dt = parse_date(it["published"])
        return dt or datetime.min.replace(tzinfo=timezone.utc)

    items.sort(key=sort_key, reverse=True)
    return items


def build_opener_for_proxy(proxy, no_proxy):
    """Build the urllib opener matching the requested proxy behaviour.

    Args:
        proxy: Proxy URL from ``--proxy``; when truthy, both http and https
            requests are routed through it.
        no_proxy: Flag from ``--no-proxy``; when set (and *proxy* is not),
            an empty ProxyHandler is installed so no proxy is used.

    With neither argument set the opener keeps urllib's default, i.e. the
    proxies configured via the process environment
    (http_proxy/https_proxy/no_proxy).

    Returns:
        The opener produced by ``urllib.request.build_opener``.
    """
    extra_handlers = []
    if proxy:
        extra_handlers.append(ProxyHandler(dict.fromkeys(("http", "https"), proxy)))
    elif no_proxy:
        extra_handlers.append(ProxyHandler({}))
    return build_opener(*extra_handlers)


def probe_feed(name, url, timeout, user_agent, opener):
    """Fetch one feed and describe its reachability and latest entry.

    Args:
        name: Display name of the feed, copied into the result.
        url: Feed URL to request.
        timeout: Timeout in seconds passed to ``opener.open``.
        user_agent: Value of the User-Agent request header.
        opener: Object with an ``open(request, timeout=...)`` method, e.g.
            the one returned by build_opener_for_proxy.

    Returns:
        A dict with the keys ``name``, ``url``, ``ok``, ``error``, ``items``,
        ``latest_title``, ``latest_published``, ``latest_age_seconds``,
        ``latest_summary``, ``latest_state`` and ``http_status``. ``ok`` is
        True only when the document was fetched and parsed as XML; otherwise
        ``error`` holds a message. The ``latest_*`` fields stay None unless
        the feed has at least one entry. Fetch and XML errors are reported
        in the dict rather than raised.
    """
    report = {"name": name, "url": url, "ok": False, "error": None, "items": 0}
    report.update(
        dict.fromkeys(
            (
                "latest_title",
                "latest_published",
                "latest_age_seconds",
                "latest_summary",
                "latest_state",
                "http_status",
            )
        )
    )

    def failed(message):
        # Record the failure reason and hand back the (still not ok) report.
        report["error"] = message
        return report

    # --- fetch ---------------------------------------------------------
    try:
        request = Request(url, headers={"User-Agent": user_agent})
        with opener.open(request, timeout=timeout) as response:
            report["http_status"] = response.status
            payload = response.read()
    except HTTPError as http_exc:
        # HTTPError is a URLError subclass, so it must be handled first to
        # keep the status code.
        report["http_status"] = http_exc.code
        return failed(f"HTTP {http_exc.code}: {http_exc.reason}")
    except (URLError, TimeoutError) as net_exc:
        return failed(f"connection error: {net_exc}")
    except Exception as other_exc:
        # One broken feed must not abort the whole agent run.
        return failed(f"unexpected error: {other_exc}")

    # --- parse ---------------------------------------------------------
    try:
        entries = extract_items(payload.decode("utf-8", errors="replace"))
    except ET.ParseError as parse_exc:
        return failed(f"feed is not valid XML: {parse_exc}")

    report["ok"] = True
    report["items"] = len(entries)
    if not entries:
        return report

    # --- describe the newest entry --------------------------------------
    newest = entries[0]
    report.update(
        latest_title=newest["title"] or None,
        latest_published=newest["published"] or None,
        # Only the first 500 characters of the summary are reported.
        latest_summary=(newest["summary"] or "")[:500] or None,
        latest_state=classify_state(newest["title"], newest["summary"]),
    )
    published_at = parse_date(newest["published"])
    if published_at is not None:
        elapsed = datetime.now(timezone.utc) - published_at
        # Timestamps in the future are clamped to an age of zero.
        report["latest_age_seconds"] = max(0, int(elapsed.total_seconds()))
    return report


def main():
    """Run the agent: print the section header and one JSON line per feed.

    A feed specification that cannot be split into NAME=URL produces a
    short error object (name, url, ok, error) instead of being probed; all
    other feeds are probed and printed with sorted keys.

    Returns:
        Always 0; per-feed failures are reported in the output.
    """
    options = parse_arguments()
    url_opener = build_opener_for_proxy(options.proxy, options.no_proxy)

    print("<<<status_feed:sep(0)>>>")
    for feed_spec in options.feed:
        try:
            feed_name, feed_url = split_feed(feed_spec)
        except ValueError as bad_spec:
            row = {
                "name": feed_spec,
                "url": "",
                "ok": False,
                "error": str(bad_spec),
            }
            print(json.dumps(row))
        else:
            row = probe_feed(
                feed_name,
                feed_url,
                options.timeout,
                options.user_agent,
                url_opener,
            )
            print(json.dumps(row, sort_keys=True))
    return 0


# Script entry point: run the agent only when executed directly and pass
# main()'s return value to the interpreter as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
