# extractly/management/commands/parser_health.py
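#
# Django management command: per-field "empty value" health counters for NetworkMonitoredPage.
#
# Illustrative invocations (a sketch; the field paths are taken from the --fields help
# text below and may differ from the actual model):
#
#   python manage.py parser_health --group --limit 10
#   python manage.py parser_health --fields network_ad_manual__html,network_ad_manual__sliced_html --group
#   python manage.py parser_health --print --name "Some site"      # flat JSON list of IDs empty in ANY field
#   python manage.py parser_health --print-by-name                 # JSON grouped by 'name', IDs per group
#   python manage.py parser_health --output health.json --last 5   # JSON-array history, then show last 5 snapshots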
from django.core.management.base import BaseCommand
from django.db.models import Q, Count
from django.utils.timezone import now
from django.core.exceptions import FieldDoesNotExist, FieldError
from django.db import models
from pathlib import Path
import json
import re

from extractly.models import NetworkMonitoredPage


def _is_text_field(model, field_name: str) -> bool:
    """Check whether a leaf field (no '__') is a CharField/TextField on the given model."""
    try:
        f = model._meta.get_field(field_name)
    except FieldDoesNotExist:
        return False
    return isinstance(f, (models.CharField, models.TextField))


def _empty_q(field_path: str) -> Q:
    """
    Build a Q that treats a value in the field/path as 'empty':
      - always matches IS NULL
      - for text fields (and any '__' path) also matches whitespace-only values via __regex
    Note: the __regex lookup raises FieldError for non-text paths; callers catch it and fall back to IS NULL only.
    """
    if "__" in field_path or _is_text_field(NetworkMonitoredPage, field_name=field_path):
        return Q(**{f"{field_path}__isnull": True}) | Q(**{f"{field_path}__regex": r"^\s*$"})
    return Q(**{f"{field_path}__isnull": True})
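# A rough sketch of the Q objects _empty_q builds (these field names are hypothetical):
#   _empty_q("title")      -> Q(title__isnull=True) | Q(title__regex=r"^\s*$")        # Char/Text leaf field
#   _empty_q("checked_at") -> Q(checked_at__isnull=True)                              # non-text leaf field
#   _empty_q("ad__html")   -> Q(ad__html__isnull=True) | Q(ad__html__regex=r"^\s*$")  # any '__' path gets the regex arm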


def _count_total_empty_for_field(field_path: str) -> tuple[int, bool]:
    """Return (count, null_only_fallback) across all pages."""
    q = _empty_q(field_path)
    try:
        return NetworkMonitoredPage.objects.filter(q).count(), False
    except FieldError:
        return NetworkMonitoredPage.objects.filter(**{f"{field_path}__isnull": True}).count(), True


def _count_total_empty_inactive_for_field(field_path: str) -> tuple[int, bool]:
    """Return (count_inactive, null_only_fallback) for empty values where is_active=False."""
    q = _empty_q(field_path)
    try:
        return NetworkMonitoredPage.objects.filter(q, is_active=False).count(), False
    except FieldError:
        # fall back to IS NULL only
        return NetworkMonitoredPage.objects.filter(**{f"{field_path}__isnull": True}, is_active=False).count(), True



def _build_annotations_for_fields(fields: list[str], use_null_only: bool = False) -> dict:
    """
    Annotations for the per-name breakdown; adds:
      - empty__{fp} for each field
      - inactive__{fp} for empty values with is_active=False
      - empty_any = OR over all _empty_q
      - inactive_empty_any = OR over all _empty_q, combined with is_active=False
    """
    ann = {}
    # per-field
    for fp in fields:
        empty_alias = "empty__" + fp.replace("__", "_")
        inactive_alias = "inactive__" + fp.replace("__", "_")
        if use_null_only:
            q_empty = Q(**{f"{fp}__isnull": True})
        else:
            q_empty = _empty_q(fp)
        ann[empty_alias] = Count("id", filter=q_empty)
        ann[inactive_alias] = Count("id", filter=q_empty & Q(is_active=False))

    # any-empty
    if use_null_only:
        any_q = Q()
        for fp in fields:
            any_q |= Q(**{f"{fp}__isnull": True})
    else:
        any_q = Q()
        for fp in fields:
            any_q |= _empty_q(fp)

    ann["empty_any"] = Count("id", filter=any_q)
    ann["inactive_empty_any"] = Count("id", filter=any_q & Q(is_active=False))
    return ann
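# For example, with fields=["network_ad_manual__html"] (a path from the --fields help text)
# the returned dict holds the aliases empty__network_ad_manual_html,
# inactive__network_ad_manual_html, empty_any and inactive_empty_any;
# '__' in the field path is flattened to '_' so the alias stays a simple keyword name.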




def _count_snapshot(*, group: bool, limit: int, field: str, fields: list[str], selected_name: str | None):
    """
    Build the snapshot dict: global empty/inactive counters per field, plus an
    optional per-name breakdown (--group) and a selected-name section (--name).
    """
    total = NetworkMonitoredPage.objects.count()

    fields = [f.strip() for f in (fields or [field]) if f and f.strip()]
    empty_total_by_field: dict[str, int] = {}
    inactive_total_by_field: dict[str, int] = {}
    null_only_fallback_fields: list[str] = []
    null_only_fallback_fields_inactive: list[str] = []

    for fp in fields:
        cnt, null_only = _count_total_empty_for_field(fp)
        empty_total_by_field[fp] = cnt
        if null_only:
            null_only_fallback_fields.append(fp)

        cnt_inact, null_only_inact = _count_total_empty_inactive_for_field(fp)
        inactive_total_by_field[fp] = cnt_inact
        if null_only_inact:
            null_only_fallback_fields_inactive.append(fp)

    # primary
    empty_total = empty_total_by_field[fields[0]]
    inactive_total_primary = inactive_total_by_field[fields[0]]

    by_name_top = None
    by_name = None
    selected = None

    if group:
        try:
            annotations = _build_annotations_for_fields(fields, use_null_only=False)
            rows = (
                NetworkMonitoredPage.objects.values("name")
                .annotate(total=Count("id"), **annotations)
                .order_by("-empty_any", "-total")
            )
        except FieldError:
            annotations = _build_annotations_for_fields(fields, use_null_only=True)
            rows = (
                NetworkMonitoredPage.objects.values("name")
                .annotate(total=Count("id"), **annotations)
                .order_by("-empty_any", "-total")
            )

        by_name = {}
        for r in rows:
            key = r["name"] or "(null)"
            entry = {
                "total": r["total"],
                "empty_any": r.get("empty_any", 0),
                "inactive_empty_any": r.get("inactive_empty_any", 0),
                "empty_by_field": {},
                "inactive_by_field": {},
            }
            for fp in fields:
                e_alias = "empty__" + fp.replace("__", "_")
                i_alias = "inactive__" + fp.replace("__", "_")
                entry["empty_by_field"][fp] = r.get(e_alias, 0)
                entry["inactive_by_field"][fp] = r.get(i_alias, 0)
            by_name[key] = entry

        by_name_top = dict(list(by_name.items())[: max(1, limit)])

        if selected_name is not None:
            if selected_name in by_name:
                selected = {"name": selected_name, **by_name[selected_name]}
            else:
                base = NetworkMonitoredPage.objects.filter(name=selected_name)
                total_sel = base.count()
                empty_map, inactive_map = {}, {}
                for fp in fields:
                    cnt, nlf = _count_total_empty_for_field_for_q(base, fp)
                    empty_map[fp] = cnt
                    if nlf and fp not in null_only_fallback_fields:
                        null_only_fallback_fields.append(fp)

                    # inactive
                    try:
                        q = _empty_q(fp)
                        inactive_map[fp] = base.filter(q, is_active=False).count()
                    except FieldError:
                        inactive_map[fp] = base.filter(**{f"{fp}__isnull": True}, is_active=False).count()

                # any-empty & inactive_any
                try:
                    any_q = _or_q_for_fields(fields)
                    empty_any_cnt = base.filter(any_q).count()
                    inactive_any_cnt = base.filter(any_q, is_active=False).count()
                except FieldError:
                    any_q = Q()
                    for fp in fields:
                        any_q |= Q(**{f"{fp}__isnull": True})
                    empty_any_cnt = base.filter(any_q).count()
                    inactive_any_cnt = base.filter(any_q, is_active=False).count()

                selected = {
                    "name": selected_name,
                    "total": total_sel,
                    "empty_any": empty_any_cnt,
                    "inactive_empty_any": inactive_any_cnt,
                    "empty_by_field": empty_map,
                    "inactive_by_field": inactive_map,
                }

    snap = {
        "ts": now().isoformat(),
        "fields": fields,
        "primary_field": fields[0],
        "total_pages": total,

        "empty_total": empty_total,
        "empty_total_by_field": empty_total_by_field,

        # NEW: global "empty & inactive" counters
        "inactive_total_primary": inactive_total_primary,
        "inactive_total_by_field": inactive_total_by_field,

        "null_only_fallback_fields": null_only_fallback_fields,
        "null_only_fallback_fields_inactive": null_only_fallback_fields_inactive,
    }
    if by_name is not None:
        snap["by_name"] = by_name
        snap["_by_name_top"] = by_name_top
    if selected is not None:
        snap["selected_name"] = selected

    return snap
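# A trimmed example of one snapshot as it lands in the history file (all values made up;
# "by_name" and "selected_name" appear only when --group / --name are used):
#   {"ts": "2024-01-01T00:00:00+00:00", "fields": ["network_ad_manual"],
#    "primary_field": "network_ad_manual", "total_pages": 1200, "empty_total": 37,
#    "empty_total_by_field": {"network_ad_manual": 37}, "inactive_total_primary": 5,
#    "inactive_total_by_field": {"network_ad_manual": 5},
#    "null_only_fallback_fields": [], "null_only_fallback_fields_inactive": []}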






def _or_q_for_fields(fields: list[str]) -> Q:
    """OR of _empty_q across all field paths ('empty in ANY of the fields')."""
    q = Q()
    for fp in fields:
        q |= _empty_q(fp)
    return q


def _count_total_empty_for_field_for_q(qs, field_path: str) -> tuple[int, bool]:
    """Like _count_total_empty_for_field, but restricted to the given queryset."""
    q = _empty_q(field_path)
    try:
        return qs.filter(q).count(), False
    except FieldError:
        return qs.filter(**{f"{field_path}__isnull": True}).count(), True


def _collect_empty_ids(fields: list[str], selected_name: str | None = None) -> list[int]:
    """Return IDs of pages empty in *any* of the fields (optionally restricted to a name)."""
    qs = NetworkMonitoredPage.objects.all()
    if selected_name:
        qs = qs.filter(name=selected_name)
    try:
        ids = list(qs.filter(_or_q_for_fields(fields)).values_list("id", flat=True))
    except FieldError:
        any_q = Q()
        for fp in fields:
            any_q |= Q(**{f"{fp}__isnull": True})
        ids = list(qs.filter(any_q).values_list("id", flat=True))
    return ids


def _collect_empty_ids_grouped(fields: list[str], selected_name: str | None = None) -> dict:
    """
    Return a dict: name -> {"count": int, "ids": [int, ...]}.
    If selected_name is given, only that single group is returned.
    """
    qs = NetworkMonitoredPage.objects.all()
    if selected_name:
        qs = qs.filter(name=selected_name)
    try:
        pairs = qs.filter(_or_q_for_fields(fields)).values_list("id", "name")
    except FieldError:
        any_q = Q()
        for fp in fields:
            any_q |= Q(**{f"{fp}__isnull": True})
        pairs = qs.filter(any_q).values_list("id", "name")

    grouped: dict[str, dict] = {}
    for _id, nm in pairs:
        key = nm or "(null)"
        bucket = grouped.setdefault(key, {"count": 0, "ids": []})
        bucket["ids"].append(_id)
        bucket["count"] += 1
    return grouped
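# Shape of the returned mapping (names and IDs here are made up):
#   {"Some site": {"count": 2, "ids": [101, 202]}, "(null)": {"count": 1, "ids": [303]}}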



def _print_snapshot(snap: dict):
    """Print a human-readable summary of the current snapshot to the console."""
    print("\nParser health — current snapshot")
    print(f"Time:            {snap['ts']}")
    print(f"Fields:          {', '.join(snap['fields'])}")
    print(f"Primary field:   {snap['primary_field']}")
    print(f"Total pages:     {snap['total_pages']}")
    print(f"Empty (primary): {snap['empty_total']}   (inactive: {snap.get('inactive_total_primary', 0)})")

    if snap.get("empty_total_by_field"):
        print("\nBy field (totals):")
        for fp in snap["fields"]:
            cnt = snap["empty_total_by_field"].get(fp, 0)
            inact = snap.get("inactive_total_by_field", {}).get(fp, 0)
            marker = "  (NULL-only)" if fp in snap.get("null_only_fallback_fields", []) else ""
            print(f"- {fp:40} empty={cnt} (inactive: {inact}){marker}")

    if "_by_name_top" in snap and snap["_by_name_top"]:
        print("\nBy name (top):")
        for name, vals in snap["_by_name_top"].items():
            print(f"- {name:30} total={vals['total']:6d}  empty_any={vals['empty_any']:6d} (inactive_any: {vals.get('inactive_empty_any',0):6d})")
            ebf = vals.get("empty_by_field", {})
            ibf = vals.get("inactive_by_field", {})
            for fp in snap["fields"]:
                print(f"    · {fp:36} {ebf.get(fp, 0):6d}  (inactive: {ibf.get(fp, 0):6d})")




def _append_txt(path: Path, snap: dict):
    """Append a human-readable snapshot block to a .txt history file."""
    lines = []
    lines.append(f"SNAPSHOT {snap['ts']}")
    lines.append(f"fields:             {', '.join(snap['fields'])}")
    lines.append(f"primary_field:      {snap['primary_field']}")
    lines.append(f"total_pages:        {snap['total_pages']}")
    lines.append(f"empty_primary:      {snap['empty_total']} (inactive: {snap.get('inactive_total_primary',0)})")
    if snap.get("empty_total_by_field"):
        lines.append("empty_total_by_field:")
        for fp in snap["fields"]:
            cnt = snap["empty_total_by_field"].get(fp, 0)
            inact = snap.get("inactive_total_by_field", {}).get(fp, 0)
            marker = " (NULL-only)" if fp in snap.get("null_only_fallback_fields", []) else ""
            lines.append(f"  - {fp}: {cnt} (inactive: {inact}){marker}")
    if snap.get("by_name"):
        lines.append("by_name:")
        for name, vals in snap["by_name"].items():
            lines.append(f"  - {name}: total={vals['total']}, empty_any={vals['empty_any']} (inactive_any: {vals.get('inactive_empty_any',0)})")
            for fp in snap["fields"]:
                lines.append(f"      · {fp}: {vals.get('empty_by_field',{}).get(fp,0)} (inactive: {vals.get('inactive_by_field',{}).get(fp,0)})")
    lines.append("---")
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as fh:
        fh.write("\n".join(lines) + "\n")



def _append_jsonl(path: Path, snap: dict):
    """Append the snapshot as a single JSON line; the console-only '_by_name_top' key is dropped."""
    path.parent.mkdir(parents=True, exist_ok=True)
    snap = {k: v for k, v in snap.items() if k != "_by_name_top"}
    with path.open("a", encoding="utf-8") as fh:
        fh.write(json.dumps(snap, ensure_ascii=False) + "\n")


def _append_json_array(path: Path, snap: dict):
    """Append the snapshot to a JSON array file (read, extend, rewrite)."""
    path.parent.mkdir(parents=True, exist_ok=True)
    snap = {k: v for k, v in snap.items() if k != "_by_name_top"}
    arr = []
    if path.exists():
        try:
            arr = json.loads(path.read_text(encoding="utf-8") or "[]")
            if not isinstance(arr, list):
                arr = []
        except Exception:
            arr = []
    arr.append(snap)
    path.write_text(json.dumps(arr, ensure_ascii=False, indent=2), encoding="utf-8")


def _show_last_from_file(path: Path, last: int):
    """Print a one-line summary of the last N snapshots stored in the history file."""
    if last <= 0:
        return
    if not path.exists():
        print(f"\n(no history file at {path})")
        return

    ext = path.suffix.lower()
    print(f"\nLast {last} snapshots from {path}:")
    try:
        if ext in (".jsonl", ""):
            lines = path.read_text(encoding="utf-8").splitlines()
            for line in lines[-last:]:
                try:
                    snap = json.loads(line)
                    print(
                        f"- {snap.get('ts')}  fields={','.join(snap.get('fields', []))}  "
                        f"total={snap.get('total_pages')}  empty_primary={snap.get('empty_total')}"
                    )
                except Exception:
                    pass
        elif ext == ".json":
            arr = json.loads(path.read_text(encoding="utf-8") or "[]")
            for snap in arr[-last:]:
                print(
                    f"- {snap.get('ts')}  fields={','.join(snap.get('fields', []))}  "
                    f"total={snap.get('total_pages')}  empty_primary={snap.get('empty_total')}"
                )
        elif ext == ".txt":
            text = path.read_text(encoding="utf-8")
            blocks = re.split(r"^SNAPSHOT ", text, flags=re.MULTILINE)
            snaps = [b for b in blocks if b.strip()]
            for b in snaps[-last:]:
                first_line = b.splitlines()[0].strip()
                print(f"- {first_line}")
        else:
            print("(unknown extension; supported: .jsonl, .json, .txt)")
    except Exception as e:
        print(f"(failed to read history: {e})")


class Command(BaseCommand):
    help = (
        "Counts empties for given field(s) in NetworkMonitoredPage, grouped by 'name', "
        "and writes a snapshot to a TXT/JSON/JSONL file (with history). "
        "Use --print to output JSON list of IDs that are empty in ANY of the selected fields. "
        "Use --print-by-name to output JSON grouped by 'name' with IDs per group."
    )

    def add_arguments(self, parser):
        parser.add_argument("--output", "-o", default="parser_health.jsonl",
                            help="Output file path (.jsonl default), or .json/.txt")
        parser.add_argument("--group", action="store_true",
                            help="Include per-name breakdown in snapshot (console/file).")
        parser.add_argument("--limit", type=int, default=25,
                            help="How many 'name' groups to print in console (top-N).")
        parser.add_argument("--last", type=int, default=0,
                            help="After writing, display last N snapshots from the output file.")
        parser.add_argument("--no-print", action="store_true",
                            help="Do not print current snapshot to console.")
        parser.add_argument("--field", default="network_ad_manual",
                            help="Primary field (used when --fields not provided).")
        parser.add_argument("--fields",
                            help="Comma-separated fields (e.g. 'network_ad_manual__html,network_ad_manual__sliced_html').")
        parser.add_argument("--name",
                            help="Optional exact name filter for print modes and snapshot selected section.")
        parser.add_argument("--print", dest="print_ids", action="store_true",
                            help="Print JSON with IDs empty in ANY of the fields (flat list).")
        parser.add_argument("--print-by-name", dest="print_grouped", action="store_true",
                            help="Print JSON grouped by 'name' with IDs per group.")

    def handle(self, *args, **opts):
        out_path = Path(opts["output"])
        group = bool(opts.get("group"))
        limit = int(opts.get("limit") or 25)
        primary_field = str(opts.get("field") or "network_ad_manual")
        fields_raw = opts.get("fields")
        fields = [s.strip() for s in fields_raw.split(",")] if fields_raw else []
        selected_name = opts.get("name")
        fields = [f for f in (fields or [primary_field]) if f]

        # Mode: flat JSON list of IDs
        if opts.get("print_ids") and not opts.get("print_grouped"):
            ids = _collect_empty_ids(fields, selected_name=selected_name)
            payload = {
                "ts": now().isoformat(),
                "fields": fields,
                "name": selected_name,
                "match": "any",
                "count": len(ids),
                "ids": ids,
            }
            print(json.dumps(payload, ensure_ascii=False))
            return

        # Mode: IDs grouped by 'name'
        if opts.get("print_grouped"):
            grouped = _collect_empty_ids_grouped(fields, selected_name=selected_name)
            total_ids = sum(bucket["count"] for bucket in grouped.values())
            # convenient list format with a stable ordering (keys are always strings here)
            groups_list = [
                {"name": name, "count": grouped[name]["count"], "ids": grouped[name]["ids"]}
                for name in sorted(grouped)
            ]
            payload = {
                "ts": now().isoformat(),
                "fields": fields,
                "name_filter": selected_name,
                "match": "any",
                "total_count": total_ids,
                "groups": groups_list,
            }
            print(json.dumps(payload, ensure_ascii=False))
            return

        # Default mode: build a count snapshot and append it to the history file
        snap = _count_snapshot(
            group=group,
            limit=limit,
            field=primary_field,
            fields=fields,
            selected_name=selected_name,
        )

        if not opts.get("no_print"):
            _print_snapshot(snap)

        ext = out_path.suffix.lower()
        if ext in (".jsonl", ""):
            _append_jsonl(out_path, snap)
            self.stdout.write(self.style.SUCCESS(f"\nSaved to {out_path} (JSONL append)"))
        elif ext == ".json":
            _append_json_array(out_path, snap)
            self.stdout.write(self.style.SUCCESS(f"\nSaved to {out_path} (JSON array)"))
        elif ext == ".txt":
            _append_txt(out_path, snap)
            self.stdout.write(self.style.SUCCESS(f"\nSaved to {out_path} (TXT append)"))
        else:
            _append_jsonl(out_path, snap)
            self.stdout.write(self.style.WARNING(f"\nUnknown extension '{ext}', saved as JSONL to {out_path}"))

        last_n = int(opts.get("last") or 0)
        if last_n > 0:
            _show_last_from_file(out_path, last_n)
