"""
Advanced Health Checker (lightweight)
Aggregates network, parser, and freshness signals into an overall health report.
"""
from __future__ import annotations

import json
import statistics
from collections import Counter
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum
from typing import Dict, List

from django.core.management.base import BaseCommand, CommandError
from django.db.models import CharField, Max, Q, TextField
from django.utils.timezone import now

from extractly.models import NetworkMonitoredPage, NetworkPageError, SourceManual, AdsManual


class Severity(Enum):
    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"


class HealthStatus(Enum):
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    CRITICAL = "critical"
    FAILING = "failing"


@dataclass
class HealthIssue:
    layer: str
    severity: Severity
    title: str
    description: str
    impact: str
    remediation_steps: List[str]


@dataclass
class HealthReport:
    portal: str
    timestamp: datetime
    overall_status: HealthStatus
    overall_score: float
    layers: Dict[str, Dict]
    issues: List[HealthIssue]
    recommendations: List[str]
    business_metrics: Dict


class SimpleHealthManager:
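    # The weights sum to 1.0, so the weighted fill-rate score stays in [0, 1].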
    CRITICAL_FIELDS_WEIGHTS: Dict[str, float] = {
        'price': 0.25,
        'currency': 0.10,
        'title': 0.20,
        'square_footage': 0.15,
        'address': 0.15,
        'city': 0.10,
        'description': 0.05,
    }

    def check_network(self, portal: str, hours: int) -> Dict:
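        """Report fetch-quality metrics for the last `hours`: empty-HTML rates,
        a duplicate-HTML rate over a 50-page sample (a bot-block signal), and
        the recent error count."""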
        cutoff = now() - timedelta(hours=hours)
        pages = NetworkMonitoredPage.objects.filter(source__name=portal, created_at__gte=cutoff)
        total = pages.count()
        # Count NULL and empty-string HTML as missing.
        empty_html = pages.filter(Q(html__isnull=True) | Q(html='')).count()
        empty_sliced = pages.filter(Q(sliced_html__isnull=True) | Q(sliced_html='')).count()
        # Python's hash() is stable within one process, which is enough to
        # detect duplicate HTML bodies inside this sample.
        sample = list(pages.values_list('html', flat=True)[:50])
        hashes = [hash(h) for h in sample if h]
        dup_rate = 0.0
        if hashes:
            counts = Counter(hashes)
            dup_rate = counts.most_common(1)[0][1] / len(hashes)
        # Approximation: counts the most recent errors (capped at 20) rather than
        # verifying they form an unbroken consecutive run.
        recent_errors = NetworkPageError.objects.filter(network_page__source__name=portal).order_by('-created_at')[:20]
        streak = len(recent_errors)
        return {
            'total_pages': total,
            'empty_html_rate': (empty_html / total) if total else 0.0,
            'empty_sliced_rate': (empty_sliced / total) if total else 0.0,
            'duplicate_html_rate': dup_rate,
            'consecutive_error_streak': streak,
        }

    def _fill_rate(self, ads, field: str) -> float:
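        """Return the fraction of ads with a non-NULL (and, for text fields,
        non-empty) value for `field`."""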
        total = ads.count()
        if total == 0:
            return 0.0
        field_obj = AdsManual._meta.get_field(field)
        qs = ads.exclude(**{f"{field}__isnull": True})
        if isinstance(field_obj, (CharField, TextField)):
            qs = qs.exclude(**{field: ''})
        return qs.count() / total

    def check_parser(self, portal: str, hours: int) -> Dict:
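        """Report parse success rate, error rate, weighted critical-field
        coverage, and the share of parsed ads with key fields still missing."""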
        cutoff = now() - timedelta(hours=hours)
        ads = AdsManual.objects.filter(networkmonitoredpage__source__name=portal, created_at__gte=cutoff)
        total = ads.count()
        parsed = ads.filter(has_data=True).count()
        errors = NetworkPageError.objects.filter(network_page__source__name=portal, created_at__gte=cutoff).count()
        parse_rate = (parsed / total) if total else 0.0
        error_rate = (errors / (total + errors)) if (total + errors) else 0.0
        crit_score = sum(self._fill_rate(ads, f) * w for f, w in self.CRITICAL_FIELDS_WEIGHTS.items())
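        # Parsed ads still missing at least one critical field; the rate below
        # is taken over all ads, not just the parsed ones.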
        partial = ads.filter(has_data=True).filter(Q(price__isnull=True) | Q(title__isnull=True) | Q(city__isnull=True)).count()
        return {
            'total_ads': total,
            'parse_rate': parse_rate,
            'error_rate': error_rate,
            'critical_fields_score': crit_score,
            'partial_rate': (partial / total) if total else 0.0,
        }

    def check_freshness(self, portal: str) -> Dict:
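        """Report pipeline freshness: last fetch/parse timestamps, processing
        lag, backlog size, and 24-hour throughput."""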
        last_page = NetworkMonitoredPage.objects.filter(source__name=portal).aggregate(Max('created_at'))['created_at__max']
        last_ad = AdsManual.objects.filter(networkmonitoredpage__source__name=portal).aggregate(Max('created_at'))['created_at__max']
        backlog = NetworkMonitoredPage.objects.filter(source__name=portal, network_ad_manual__isnull=True, is_active=True).count()
        cutoff = now() - timedelta(hours=24)
        pages_24h = NetworkMonitoredPage.objects.filter(source__name=portal, created_at__gte=cutoff).count()
        ads_24h = AdsManual.objects.filter(networkmonitoredpage__source__name=portal, created_at__gte=cutoff).count()
        processing_lag = None
        if last_page and last_ad:
            # Clamp at zero: a negative lag just means parsing has caught up.
            processing_lag = max(0.0, (last_page - last_ad).total_seconds() / 3600.0)
        return {
            'last_page_fetched': last_page.isoformat() if last_page else None,
            'last_ad_created': last_ad.isoformat() if last_ad else None,
            'processing_lag_hours': processing_lag,
            'backlog_size': backlog,
            'pages_per_hour_24h': (pages_24h / 24.0),
            'ads_per_hour_24h': (ads_24h / 24.0),
            'processing_efficiency': (ads_24h / pages_24h) if pages_24h else 0.0,
        }

    def run(self, portal: str, hours: int = 24) -> HealthReport:
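        """Aggregate the three layer checks: each layer starts at 100 and loses
        fixed penalties for threshold breaches; the overall score is their mean."""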
        network = self.check_network(portal, hours)
        parser = self.check_parser(portal, hours)
        fresh = self.check_freshness(portal)
        net_score = 100.0
        if network['empty_html_rate'] > 0.2:
            net_score -= 30
        if network['empty_sliced_rate'] > 0.2:
            net_score -= 20
        if network['duplicate_html_rate'] > 0.8:
            net_score -= 40
        if network['consecutive_error_streak'] > 10:
            net_score -= 30
        net_score = max(0.0, min(100.0, net_score))
        par_score = 100.0 * parser['parse_rate']
        if parser['error_rate'] > 0.3:
            par_score -= 30
        if parser['critical_fields_score'] < 0.6:
            par_score -= 20
        par_score = max(0.0, min(100.0, par_score))
        fr_score = 100.0
        if fresh['processing_lag_hours'] is not None and fresh['processing_lag_hours'] > 4:
            fr_score -= 30
        if fresh['backlog_size'] > 1000:
            fr_score -= 30
        if fresh['pages_per_hour_24h'] < 5:
            fr_score -= 20
        fr_score = max(0.0, min(100.0, fr_score))
        overall = statistics.mean([net_score, par_score, fr_score])
        if overall >= 80:
            status = HealthStatus.HEALTHY
        elif overall >= 60:
            status = HealthStatus.DEGRADED
        elif overall >= 40:  # FAILING cutoff is an assumption; tune as needed
            status = HealthStatus.CRITICAL
        else:
            status = HealthStatus.FAILING
        issues: List[HealthIssue] = []
        recs: List[str] = []
        if parser['parse_rate'] < 0.5:
            issues.append(HealthIssue(
                layer='parser', severity=Severity.CRITICAL, title='Low Parse Rate',
                description=f"Parse rate at {parser['parse_rate']:.0%}, selectors likely broken",
                impact='Data extraction failing for many pages',
                remediation_steps=[f"Run: python manage.py manual_debug_dump --name {portal} --only-unparsed --limit 20",
                                   'Review selectors and update Configuration/<portal>/selector.json']
            ))
            recs.append('Run manual_debug_dump to inspect failing pages and update selectors for key fields')
        if network['duplicate_html_rate'] > 0.8:
            issues.append(HealthIssue(
                layer='network', severity=Severity.HIGH, title='Possible Bot Block',
                description='High duplicate HTML rate suggests bot/captcha blocking',
                impact='Pages fetched but not usable',
                remediation_steps=['Rotate proxies, add delays, emulate browser better']
            ))
            recs.append('Check bot protections; rotate proxies and add backoff')
        business = {
            'data_completeness': parser['critical_fields_score'] * 100.0,
            'data_freshness_hours': fresh['processing_lag_hours'],
            'operational_efficiency': min(100.0, max(0.0, fresh['processing_efficiency'] * 100.0)),
        }
        return HealthReport(
            portal=portal,
            timestamp=now(),
            overall_status=status,
            overall_score=overall,
            layers={'network': {'score': net_score, 'metrics': network}, 'parser': {'score': par_score, 'metrics': parser}, 'freshness': {'score': fr_score, 'metrics': fresh}},
            issues=issues,
            recommendations=recs,
            business_metrics=business,
        )
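
# Example programmatic use (a sketch; assumes Django settings are configured):
#   mgr = SimpleHealthManager()
#   report = mgr.run('otodom', hours=24)
#   print(report.overall_status.value, f"{report.overall_score:.1f}/100")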


class Command(BaseCommand):
    help = 'Run a simple advanced health check for a portal or all portals'

    def add_arguments(self, parser):
        parser.add_argument('--portal', type=str, help='Portal name (e.g., otodom)')
        parser.add_argument('--all', action='store_true', help='Check all enabled portals')
        parser.add_argument('--hours', type=int, default=24, help='Hours to analyze (default 24)')
        parser.add_argument('--json', type=str, help='Write JSON output to path')
        parser.add_argument('--exit-on-critical', action='store_true', help='Exit code 2 if any critical')

    def handle(self, *args, **opts):
        portal = opts.get('portal')
        check_all = opts.get('all')
        hours = opts.get('hours')
        json_out = opts.get('json')
        exit_on_critical = opts.get('exit_on_critical')
        if not portal and not check_all:
            raise CommandError('Must specify --portal or --all')
        if check_all:
            portals = list(SourceManual.objects.filter(enable=True).values_list('name', flat=True))
        else:
            portals = [portal]
        mgr = SimpleHealthManager()
        reports: List[HealthReport] = []
        critical_any = False
        for p in portals:
            self.stdout.write(f"Checking: {p}")
            r = mgr.run(p, hours)
            reports.append(r)
            self._print_report(r)
            if r.overall_status in (HealthStatus.CRITICAL, HealthStatus.FAILING):
                critical_any = True
        if json_out:
            data = {'timestamp': now().isoformat(), 'reports': [self._to_dict(r) for r in reports]}
            with open(json_out, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            self.stdout.write(self.style.SUCCESS(f'JSON written to {json_out}'))
        if exit_on_critical and critical_any:
            raise SystemExit(2)

    def _print_report(self, r: HealthReport):
        self.stdout.write('-' * 72)
        self.stdout.write(self.style.HTTP_INFO(f"{r.portal} -> {r.overall_status.value.upper()} {r.overall_score:.1f}/100"))
        for layer, info in r.layers.items():
            self.stdout.write(f"  {layer}: {info['score']:.1f}/100")
        if r.issues:
            self.stdout.write('Issues:')
            for i in r.issues:
                self.stdout.write(f"  - [{i.severity.value}] {i.title}: {i.description}")
        if r.recommendations:
            self.stdout.write('Recommendations:')
            for rec in r.recommendations:
                self.stdout.write(f"  • {rec}")

    def _to_dict(self, r: HealthReport) -> Dict:
        return {
            'portal': r.portal,
            'timestamp': r.timestamp.isoformat(),
            'overall_status': r.overall_status.value,
            'overall_score': r.overall_score,
            'layers': r.layers,
            'issues': [
                {
                    'layer': i.layer,
                    'severity': i.severity.value,
                    'title': i.title,
                    'description': i.description,
                    'impact': i.impact,
                    'remediation_steps': i.remediation_steps,
                } for i in r.issues
            ],
            'recommendations': r.recommendations,
            'business_metrics': r.business_metrics,
        }
