"""
Server Monitor Engine
Consolidated health checks and utilities for the manual pipeline in one module.

Includes:
- NetworkHealthChecker: fetch layer metrics and warnings
- ParserHealthChecker: parse quality and field completeness (safe for Decimal fields)
- FreshnessChecker: throughput/lag/backlog with warnings
- ServerMonitorEngine: orchestration plus utilities migrated from:
  * debug_dump (debug HTML/selector bundles)
  * health_snapshot (portal metrics)
  * lint_selectors (selector schema checks)
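
Example (illustrative sketch; the portal name is hypothetical and Django must
already be configured):

    engine = ServerMonitorEngine()
    report = engine.run_all("example-portal", hours=24)
    for layer in ("network", "parser", "freshness"):
        print(layer, report[layer]["healthy"], report[layer].get("warnings", []))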
"""

from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
from collections import Counter
from datetime import timedelta
import json
import statistics

from django.db.models import CharField, Count, Max, Q, TextField
from django.db.models.functions import Length
from django.utils.timezone import now

from bs4 import BeautifulSoup

from extractly.models import (
    SourceManual,
    NetworkMonitoredPage,
    NetworkPageError,
    AdsManual,
)


class NetworkHealthChecker:
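    """Fetch-layer health: page volume, empty/undersized HTML, duplicate-HTML detection, and error streaks."""
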
    def check_fetch_health(self, portal: str, hours: int = 24) -> Dict:
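        """Return fetch-layer health for ``portal`` over the last ``hours`` hours."""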
        cutoff = now() - timedelta(hours=hours)
        pages = NetworkMonitoredPage.objects.filter(source__name=portal, created_at__gte=cutoff)
        total = pages.count()
        if total == 0:
            return {'healthy': False, 'reason': 'no_pages_fetched', 'metrics': {}}

        # Treat both NULL and empty-string HTML as "empty".
        empty_html = pages.filter(Q(html__isnull=True) | Q(html='')).count()
        empty_sliced = pages.filter(Q(sliced_html__isnull=True) | Q(sliced_html='')).count()
        metrics = {
            'total_pages': total,
            'empty_html_count': empty_html,
            'empty_html_rate': empty_html / total,
            'empty_sliced_count': empty_sliced,
            'empty_sliced_rate': empty_sliced / total,
            'undersized_pages': pages.annotate(html_size=Length('html')).filter(html_size__lt=10000).count(),
            'duplicate_html_rate': self._check_duplicate_hashes(pages),
            'consecutive_error_streak': self._check_error_streaks(portal),
        }

        healthy = (
            metrics['empty_html_rate'] < 0.1 and
            metrics['empty_sliced_rate'] < 0.15 and
            metrics['consecutive_error_streak'] < 10
        )
        return {
            'healthy': healthy,
            'layer': 'network',
            'metrics': metrics,
            'warnings': self._generate_fetch_warnings(metrics),
        }

    def _check_duplicate_hashes(self, pages) -> float:
        if pages.count() < 5:
            return 0.0
        hashes = [hash(p.html) for p in pages[:50] if p.html]
        if not hashes:
            return 0.0
        # Share of sampled pages whose HTML matches the most common hash.
        _, most_common_count = Counter(hashes).most_common(1)[0]
        return most_common_count / len(hashes)

    def _check_error_streaks(self, portal: str) -> int:
        """Count how many of the most recent pages (newest first) errored in a row."""
        recent_pages = (
            NetworkMonitoredPage.objects
            .filter(source__name=portal)
            .order_by('-created_at')[:20]
        )
        streak = 0
        for page in recent_pages:
            # Iterating error rows directly would always be truthy; the streak must
            # come from pages that actually have an associated error record.
            if NetworkPageError.objects.filter(network_page=page).exists():
                streak += 1
            else:
                break
        return streak

    def _generate_fetch_warnings(self, metrics: Dict) -> List[str]:
        warnings: List[str] = []
        if metrics['empty_html_rate'] > 0.2:
            warnings.append("HIGH: >20% pages with empty HTML - check network/auth")
        if metrics['duplicate_html_rate'] > 0.8:
            warnings.append("CRITICAL: Duplicate HTML detected - likely bot block/captcha")
        if metrics['undersized_pages'] > metrics['total_pages'] * 0.3:
            warnings.append("WARNING: Many undersized pages - possible captcha redirect")
        if metrics['consecutive_error_streak'] > 5:
            warnings.append(f"CRITICAL: {metrics['consecutive_error_streak']} consecutive errors")
        return warnings


class ParserHealthChecker:
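    """Parse-quality health: parse/error rates, weighted critical-field coverage, and anomaly checks.

    CRITICAL_FIELDS maps each field to its weight in the critical-fields score;
    the weights sum to 1.0, so the score is a weighted average of fill rates.
    """
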
    CRITICAL_FIELDS: Dict[str, float] = {
        'price': 0.25,
        'currency': 0.10,
        'title': 0.20,
        'square_footage': 0.15,
        'address': 0.15,
        'city': 0.10,
        'description': 0.05,
    }
    SECONDARY_FIELDS: List[str] = ['rooms', 'bathrooms', 'floor', 'estate_condition', 'lon', 'lat', 'advertiser_name']

    def check_parser_health(self, portal: str, hours: int = 24) -> Dict:
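        """Return parser-layer health for ``portal`` over the last ``hours`` hours."""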
        cutoff = now() - timedelta(hours=hours)
        ads = AdsManual.objects.filter(networkmonitoredpage__source__name=portal, created_at__gte=cutoff)
        total = ads.count()
        if total == 0:
            return {'healthy': False, 'reason': 'no_ads_parsed', 'metrics': {}}

        errors = NetworkPageError.objects.filter(network_page__source__name=portal, created_at__gte=cutoff).count()
        parsed = ads.filter(has_data=True).count()
        partial = self._count_partial_records(ads)
        metrics = {
            'total_ads': total,
            'parsed_ads': parsed,
            'parse_rate': parsed / total,
            'error_count': errors,
            'error_rate': errors / (total + errors) if (total + errors) else 0.0,
            'critical_fields_score': self._check_critical_fields(ads),
            'secondary_fields_score': self._check_secondary_fields(ads),
            'price_anomalies': self._detect_price_anomalies(ads),
            'duplicate_titles_rate': self._check_duplicate_titles(ads),
            'truncated_descriptions': self._check_truncated_descriptions(ads),
            'partial_records': partial,
            'partial_rate': partial / total,
        }

        healthy = (
            metrics['parse_rate'] > 0.7 and
            metrics['error_rate'] < 0.15 and
            metrics['critical_fields_score'] > 0.75
        )
        return {
            'healthy': healthy,
            'layer': 'parser',
            'metrics': metrics,
            'warnings': self._generate_parser_warnings(metrics),
            'field_breakdown': self._get_field_breakdown(ads),
        }

    def _fill_rate(self, ads, field: str) -> float:
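        """Share of ads with ``field`` populated. Only text fields are also compared
        against the empty string, which keeps the check safe for Decimal/numeric fields."""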
        total = ads.count()
        if total == 0:
            return 0.0
        try:
            field_obj = AdsManual._meta.get_field(field)
        except Exception:
            return 0.0
        qs = ads.exclude(**{f"{field}__isnull": True})
        if isinstance(field_obj, (CharField, TextField)):
            qs = qs.exclude(**{field: ''})
        return qs.count() / total

    def _check_critical_fields(self, ads) -> float:
        total_score = 0.0
        for field, weight in self.CRITICAL_FIELDS.items():
            total_score += self._fill_rate(ads, field) * weight
        return total_score

    def _check_secondary_fields(self, ads) -> float:
        rates = [self._fill_rate(ads, f) for f in self.SECONDARY_FIELDS]
        return statistics.mean(rates) if rates else 0.0

    def _detect_price_anomalies(self, ads) -> Dict:
        return {
            'zero_prices': ads.filter(price=0).count(),
            'negative_prices': ads.filter(price__lt=0).count(),
            'suspiciously_low': ads.filter(price__gt=0, price__lt=100).count(),
            'suspiciously_high': ads.filter(price__gt=10000000).count(),
        }

    def _check_duplicate_titles(self, ads) -> float:
        total = ads.count()
        if total < 10:
            return 0.0
        # Ignore missing titles so NULL/empty values are not counted as duplicates of each other.
        title_counts = (
            ads.exclude(title__isnull=True).exclude(title='')
            .values('title').annotate(count=Count('id')).filter(count__gt=1)
        )
        duplicate_count = sum(tc['count'] for tc in title_counts)
        return duplicate_count / total

    def _check_truncated_descriptions(self, ads) -> int:
        return ads.filter(description__isnull=False).filter(Q(description__endswith='...') | Q(description__endswith='…')).count()

    def _count_partial_records(self, ads) -> int:
        return ads.filter(has_data=True).filter(Q(price__isnull=True) | Q(title__isnull=True) | Q(city__isnull=True)).count()

    def _get_field_breakdown(self, ads) -> Dict:
        total = ads.count()
        if total == 0:
            return {}
        breakdown: Dict[str, Dict] = {}
        for field in list(self.CRITICAL_FIELDS.keys()) + self.SECONDARY_FIELDS:
            rate = self._fill_rate(ads, field)
            breakdown[field] = {'filled': int(rate * total), 'fill_rate': rate}
        return breakdown

    def _generate_parser_warnings(self, metrics: Dict) -> List[str]:
        warnings: List[str] = []
        if metrics['parse_rate'] < 0.5:
            warnings.append("CRITICAL: Parse rate <50% - major selector breakage")
        elif metrics['parse_rate'] < 0.7:
            warnings.append("HIGH: Parse rate <70% - selectors degrading")
        if metrics['error_rate'] > 0.3:
            warnings.append("CRITICAL: Error rate >30% - investigate top errors")
        if metrics['critical_fields_score'] < 0.6:
            warnings.append("HIGH: Critical fields <60% filled - check key selectors")
        if metrics['partial_rate'] > 0.2:
            warnings.append(f"WARNING: {metrics['partial_rate']:.1%} partial records")
        price_anom = metrics['price_anomalies']
        if price_anom['zero_prices'] + price_anom['negative_prices'] > 10:
            warnings.append("WARNING: Price extraction issues detected")
        return warnings


class FreshnessChecker:
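    """Pipeline freshness: last fetch/parse timestamps, processing lag, backlog, and 24h throughput."""
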
    def check_freshness(self, portal: str) -> Dict:
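        """Return freshness metrics for ``portal`` based on the latest records and the last 24 hours."""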
        last_page = NetworkMonitoredPage.objects.filter(source__name=portal).aggregate(Max('created_at'))['created_at__max']
        last_ad = AdsManual.objects.filter(networkmonitoredpage__source__name=portal).aggregate(Max('created_at'))['created_at__max']
        backlog = NetworkMonitoredPage.objects.filter(source__name=portal, network_ad_manual__isnull=True, is_active=True).count()
        cutoff_24h = now() - timedelta(hours=24)
        pages_24h = NetworkMonitoredPage.objects.filter(source__name=portal, created_at__gte=cutoff_24h).count()
        ads_24h = AdsManual.objects.filter(networkmonitoredpage__source__name=portal, created_at__gte=cutoff_24h).count()
        processing_lag = None
        if last_page and last_ad:
            processing_lag = (last_page - last_ad).total_seconds() / 3600
        metrics = {
            'last_page_fetched': last_page.isoformat() if last_page else None,
            'last_ad_created': last_ad.isoformat() if last_ad else None,
            'processing_lag_hours': processing_lag,
            'backlog_size': backlog,
            'pages_per_hour_24h': pages_24h / 24,
            'ads_per_hour_24h': ads_24h / 24,
            'processing_efficiency': ads_24h / pages_24h if pages_24h > 0 else 0.0,
        }
        healthy = (processing_lag is None or processing_lag < 4) and backlog < 1000 and pages_24h > 100
        return {'healthy': healthy, 'layer': 'freshness', 'metrics': metrics, 'warnings': self._generate_freshness_warnings(metrics)}

    def _generate_freshness_warnings(self, metrics: Dict) -> List[str]:
        warnings: List[str] = []
        lag = metrics.get('processing_lag_hours')
        backlog = metrics.get('backlog_size', 0)
        pph = metrics.get('pages_per_hour_24h', 0)
        aph = metrics.get('ads_per_hour_24h', 0)
        eff = metrics.get('processing_efficiency', 0)
        if lag is not None and lag > 4:
            warnings.append(f"CRITICAL: Processing lag {lag:.1f}h exceeds 4h")
        if backlog > 1000:
            warnings.append(f"HIGH: Backlog size {backlog} > 1000; parser is falling behind")
        if pph < 5:
            warnings.append(f"WARNING: Low fetch rate ({pph:.1f} pages/hour) in last 24h")
        if aph < 5:
            warnings.append(f"WARNING: Low parse throughput ({aph:.1f} ads/hour) in last 24h")
        if eff < 0.5 and pph >= 5:
            warnings.append(f"WARNING: Low processing efficiency ({eff:.0%}); many pages not producing ads")
        return warnings


# --------- Consolidated utilities migrated from commands ---------

CORE_FIELDS: List[str] = [
    "price",
    "currency",
    "price_per_m2",
    "title",
    "description",
    "address",
    "square_footage",
    "rooms",
    "estate_type",
    "offer_type",
    "city",
]


class ServerMonitorEngine:
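    """Orchestrates the three health checkers and hosts the utilities migrated from
    the debug_dump, health_snapshot, and lint_selectors commands."""
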
    def __init__(self):
        self.network = NetworkHealthChecker()
        self.parser = ParserHealthChecker()
        self.freshness = FreshnessChecker()

    def run_all(self, portal: str, hours: int = 24) -> Dict:
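        """Run the network, parser, and freshness checks for ``portal`` and return one combined report."""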
        net = self.network.check_fetch_health(portal, hours)
        par = self.parser.check_parser_health(portal, hours)
        fr = self.freshness.check_freshness(portal)
        return {'portal': portal, 'network': net, 'parser': par, 'freshness': fr}

    def health_snapshot(self, portals: Optional[List[str]] = None, limit: int = 25, thr_error: float = 0.3) -> Dict:
        """Build a per-portal health snapshot from NetworkMonitoredPage and AdsManual, including data-quality metrics."""
        ts = now().isoformat()
        if portals is None:
            sm_qs = SourceManual.objects.all()
        else:
            sm_qs = SourceManual.objects.filter(Q(name__in=portals) | Q(source__name__in=portals))

        rows: List[Dict[str, Any]] = []
        for sm in sm_qs:
            portal_name = sm.name or (sm.source.name if sm.source else "unknown")
            page_qs = NetworkMonitoredPage.objects.filter(source=sm.source) if sm.source_id else NetworkMonitoredPage.objects.filter(name=portal_name)
            
            total_pages = page_qs.count()
            parsed_pages = page_qs.filter(network_ad_manual__isnull=False).count()
            unparsed_pages = total_pages - parsed_pages
            is_active_pages = page_qs.filter(is_active=True).count()

            # Get all ads for this portal
            ad_ids = list(page_qs.filter(network_ad_manual__isnull=False).values_list("network_ad_manual_id", flat=True))
            ads_qs = AdsManual.objects.filter(id__in=ad_ids)
            total_ads = ads_qs.count()

            # Check data completeness for critical fields
            critical_fields = ['price', 'square_footage', 'city', 'title', 'rooms']
            complete_ads = 0
            incomplete_ads = 0
            
            for ad in ads_qs:
                is_complete = all([
                    getattr(ad, field) not in [None, '', 0, '0']
                    for field in critical_fields
                ])
                if is_complete:
                    complete_ads += 1
                else:
                    incomplete_ads += 1

            # Fill counts for all core fields; only text fields are also compared
            # against the empty string, so Decimal/Integer fields filter safely.
            fills: Dict[str, int] = {}
            for f in CORE_FIELDS:
                filled_qs = ads_qs.exclude(**{f"{f}__isnull": True})
                if isinstance(AdsManual._meta.get_field(f), (CharField, TextField)):
                    filled_qs = filled_qs.exclude(**{f: ''})
                fills[f] = filled_qs.count()
            last_ad = ads_qs.order_by("-created_at").values_list("created_at", flat=True).first()

            # Calculate metrics
            parse_rate = (parsed_pages / total_pages) if total_pages else 0.0
            completion_rate = (complete_ads / total_ads) if total_ads else 0.0
            incomplete_rate = (incomplete_ads / total_ads) if total_ads else 0.0
            
            # Health is based on data quality rather than error counts:
            # healthy when parse_rate >= 50% and completion_rate >= 50%.
            healthy = parse_rate >= 0.5 and completion_rate >= 0.5

            rows.append({
                "portal": portal_name,
                "total_pages": total_pages,
                "parsed_pages": parsed_pages,
                "unparsed_pages": unparsed_pages,
                "is_active_pages": is_active_pages,
                "total_ads": total_ads,
                "complete_ads": complete_ads,
                "incomplete_ads": incomplete_ads,
                "parse_rate": round(parse_rate, 4),
                "completion_rate": round(completion_rate, 4),
                "incomplete_rate": round(incomplete_rate, 4),
                "error_rate": round(incomplete_rate, 4),  # Kept for backward compatibility
                "error_pages": incomplete_ads,  # Kept for backward compatibility
                "healthy": healthy,
                "last_ad_created": last_ad.isoformat() if last_ad else None,
                "fill_rates": {k: (v / total_ads) if total_ads else 0.0 for k, v in fills.items()},
                "fill_counts": fills,
            })

        rows.sort(key=lambda r: (-r["parse_rate"], -r["completion_rate"]))
        return {"ts": ts, "thresholds": {"error_rate": thr_error, "min_parse_rate": 0.5, "min_completion_rate": 0.5}, "portals": rows[: max(1, limit)]}

    def debug_dump(self, portals: Optional[List[str]] = None, all_portals: bool = False, limit: int = 25, only_unparsed: bool = False, only_errors: bool = False, out_dir: str = "debug/manual_dump", check_selectors: bool = False) -> Dict[str, Any]:
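        """Write per-page debug bundles (page.html, selectors.json, info.json) under ``out_dir``,
        one directory per portal, and return a small summary dict."""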
        if all_portals:
            sm_qs = SourceManual.objects.all()
        else:
            if not portals:
                raise ValueError("Provide portals list or set all_portals=True")
            sm_qs = SourceManual.objects.filter(Q(name__in=portals) | Q(source__name__in=portals))

        # Make path relative to server_monitor folder if not absolute
        out_root = Path(out_dir)
        if not out_root.is_absolute():
            # Make it relative to manual_agregator/server_monitor
            server_monitor_dir = Path(__file__).parent
            out_root = server_monitor_dir / out_dir
        
        out_root.mkdir(parents=True, exist_ok=True)
        summary: Dict[str, Any] = {
            "out": str(out_root.resolve()), 
            "bundles": 0,
            "portals_checked": 0,
            "pages_found": 0
        }

        for sm in sm_qs:
            summary["portals_checked"] += 1
            portal = sm.name or (sm.source.name if sm.source else "unknown")
            portal_dir = out_root / portal.lower()
            portal_dir.mkdir(parents=True, exist_ok=True)

            page_qs = NetworkMonitoredPage.objects.filter(source=sm.source) if sm.source_id else NetworkMonitoredPage.objects.filter(name=portal)
            if only_unparsed:
                page_qs = page_qs.filter(network_ad_manual__isnull=True)
            if only_errors:
                page_qs = page_qs.filter(networkpageerror__isnull=False)

            pages = list(page_qs.order_by("-created_at")[:limit])
            summary["pages_found"] += len(pages)
            for p in pages:
                bundle_dir = portal_dir / f"page_{p.id or 0}"
                bundle_dir.mkdir(parents=True, exist_ok=True)

                html = p.html or p.sliced_html or ""
                (bundle_dir / "page.html").write_text(html, encoding="utf-8")

                selectors = sm.selectors or {}
                (bundle_dir / "selectors.json").write_text(json.dumps(selectors, ensure_ascii=False, indent=2), encoding="utf-8")

                errs = list(NetworkPageError.objects.filter(network_page=p).values("error_type", "error_message", "created_at"))
                created_at = getattr(p, "created_at", None)
                info: Dict[str, Any] = {
                    "url": p.url,
                    "name": p.name,
                    "is_active": p.is_active,
                    "created_at": created_at.isoformat() if created_at else None,
                    "has_ad_manual": bool(p.network_ad_manual_id),
                    "html_size": len(html.encode("utf-8")) if html else 0,
                    "selector_count": len(selectors) if isinstance(selectors, dict) else 0,
                    # "fetched_at" mirrors created_at; kept as a separate key for existing consumers.
                    "fetched_at": created_at.isoformat() if created_at else None,
                    "error_count": len(errs),
                    "errors": errs,
                }

                if check_selectors and html:
                    try:
                        soup = BeautifulSoup(html, "html.parser")
                        sel_counts: Dict[str, Any] = {}
                        for key, cfg in (selectors.items() if isinstance(selectors, dict) else []):
                            try:
                                if isinstance(cfg, dict) and cfg.get("selector"):
                                    sel = cfg.get("selector")
                                    sel_counts[key] = len(soup.select(sel))
                                elif isinstance(cfg, dict) and cfg.get("selectors"):
                                    total = 0
                                    for sel in cfg.get("selectors") or []:
                                        total += len(soup.select(sel))
                                    sel_counts[key] = total
                            except Exception:
                                sel_counts[key] = -1
                        info["selector_hits"] = sel_counts
                    except Exception as e:
                        info["selector_check_error"] = str(e)

                (bundle_dir / "info.json").write_text(json.dumps(info, ensure_ascii=False, indent=2), encoding="utf-8")
                summary["bundles"] += 1

        return summary

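    # Property names accepted in a selector config node; lint_selectors flags anything else.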
    KNOWN_CFG_KEYS: Set[str] = {
        "selector", "selectors", "fieldType", "label", "valueType", "keyMap",
        "ifMissing", "defaultValue", "trueOptions", "falseOptions", "paragraphs",
        "joinWith", "currencyField", "altLabels", "cast", "splitBy", "fromMain",
        "splitIndex", "maxField", "rangeMode", "allowRange", "specialMap", "isMain",
        "labelField",
    }

    def lint_selectors(self, portals: Optional[List[str]] = None, all_portals: bool = False) -> List[Dict[str, Any]]:
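        """Check each portal's selector tree: report config keys that do not map to an AdsManual
        field and properties that are not in KNOWN_CFG_KEYS."""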
        if all_portals:
            sm_qs = SourceManual.objects.all()
        else:
            if not portals:
                raise ValueError("Provide portals list or set all_portals=True")
            sm_qs = SourceManual.objects.filter(Q(name__in=portals) | Q(source__name__in=portals))

        ads_fields: Set[str] = {f.name for f in AdsManual._meta.get_fields() if getattr(f, 'attname', None)}
        results: List[Dict[str, Any]] = []

        def _lint_selector_tree(tree: Dict[str, Any]) -> Tuple[List[str], List[str]]:
            unknown_fields: List[str] = []
            bad_props: List[str] = []

            def walk(node: Any, path: List[str]):
                if isinstance(node, dict):
                    looks_like_config_map = any(
                        isinstance(v, dict) and ("selector" in v or "selectors" in v or "fieldType" in v)
                        for v in node.values()
                    )
                    if looks_like_config_map:
                        for k, cfg in node.items():
                            if isinstance(cfg, dict) and ("selector" in cfg or "selectors" in cfg or "fieldType" in cfg):
                                if k not in ads_fields and k not in ("geo_json", "lat_coordinates_part", "lon_coordinates_part", "site_id_part"):
                                    unknown_fields.append("/".join(path + [k]))
                                for prop in cfg.keys():
                                    if prop not in self.KNOWN_CFG_KEYS:
                                        bad_props.append("/".join(path + [k, prop]))
                            else:
                                walk(cfg, path + [k])
                    else:
                        for k, v in node.items():
                            walk(v, path + [k])
                elif isinstance(node, list):
                    for i, v in enumerate(node):
                        walk(v, path + [str(i)])

            walk(tree, [])
            return unknown_fields, bad_props

        for sm in sm_qs:
            portal = sm.name or (sm.source.name if sm.source else "unknown")
            sel = sm.selectors or {}
            unknown, bad = _lint_selector_tree(sel)
            results.append({
                "portal": portal,
                "unknown_field_paths": sorted(set(unknown)),
                "bad_property_paths": sorted(set(bad)),
            })

        return results