# extractly/management/commands/ads_fields_health.py
from django.core.management.base import BaseCommand
from django.db.models import Q, Count, OuterRef, Subquery, IntegerField
from django.db import models
from django.utils.timezone import now
from pathlib import Path
import json
import re

from extractly.models import AdsManual, NetworkMonitoredPage  # ← podmień jeśli trzeba


# ===== default configuration =====
# Model whose fields are inspected by this command.
TARGET_MODEL = AdsManual

# Field names checked when --fields is not given. Names that do not exist on
# TARGET_MODEL are dropped by _sanitize_fields() before any query is built.
DEFAULT_FIELDS = [
    # core attributes
    "title", "description", "price", "currency", "price_per_m2", "rent",
    "address", "square_footage", "rooms", "bathrooms",
    "floor", "floors_num", "estate_condition", "heating_type",
    "land_area", "building_type", "energy_certificate", "market_type",
    "build_year", "media",
    # booleans (an empty value is NULL)
    "elevator", "electricity", "water", "gas", "phone", "internet",
    "sewerage", "equipment", "garden", "garage", "basement", "attic",
    # NOTE(review): "seprete_kitchen" looks like a misspelling of
    # "separate_kitchen"; presumably it mirrors the model's actual field
    # name — confirm against the model before renaming.
    "terrace", "seprete_kitchen", "furnished",
    # other
    "balcony", "parking_space",
    "site_id", "land_and_mortgage_register", "ownership_form", "available_from",
    "windows", "attic_type", "building_material", "security",
    "fencing", "access_road", "location", "plot_type", "dimensions",
    "premises_location", "purpose", "location_info", "roof",
    "recreational_house", "roof_covering", "construction", "height",
    "office_rooms", "social_facilities", "parking", "ramp",
    "floor_material", "lighting",
    # address
    "country", "state", "province", "commune", "city", "district", "street",
    # other useful fields
    "original_image_urls", "images", "advertiser_name", "advertiser_phone", "advertiser_type",
]


# ===== helpers =====
def _is_text_field(model, field_name: str) -> bool:
    """Tell whether ``field_name`` on ``model`` is a CharField or TextField.

    Subclasses of those two field types count as text as well. If the
    field lookup raises for any reason, the answer is ``False`` instead
    of propagating the error.
    """
    try:
        field = model._meta.get_field(field_name)
    except Exception:
        # Lookup failed (e.g. unknown name): report "not a text field".
        return False
    else:
        text_types = (models.CharField, models.TextField)
        return isinstance(field, text_types)

def _is_json_like(model, field_name: str) -> bool:
    """Tell whether ``field_name`` on ``model`` is a JSON-style field.

    The decision is made purely on the field's class name
    (``JSONField`` or ``HStoreField``), so no particular field module has
    to be imported here. A failing field lookup yields ``False``.
    """
    try:
        field = model._meta.get_field(field_name)
    except Exception:
        return False
    class_name = field.__class__.__name__
    return class_name == "JSONField" or class_name == "HStoreField"

def _empty_q(model, field_name: str) -> Q:
    """Build the ``Q`` condition that matches an "empty" ``field_name``.

    * Text fields (see ``_is_text_field``) are empty when they are NULL or
      consist of whitespace only (which includes the empty string).
    * Every other field, JSON-like ones included, is empty only when it
      is NULL. Empty JSON containers (``[]`` / ``{}``) are not detected;
      ``_is_json_like`` is available if that distinction is ever needed.
    """
    null_q = Q(**{f"{field_name}__isnull": True})
    if not _is_text_field(model, field_name):
        return null_q
    blank_q = Q(**{f"{field_name}__regex": r"^\s*$"})
    return null_q | blank_q

def _valid_model_fields(model) -> set[str]:
    """Return the names of the fields on ``model`` that have an ``attname``.

    Entries returned by ``model._meta.get_fields()`` that lack an
    ``attname`` attribute are left out.
    """
    names = set()
    for field in model._meta.get_fields():
        if hasattr(field, "attname"):
            names.add(field.name)
    return names

def _sanitize_fields(model, fields: list[str]) -> list[str]:
    """Return the entries of ``fields`` that exist on ``model``.

    The input order is preserved and a new list is returned; names not
    reported by ``_valid_model_fields`` are dropped.
    """
    known = _valid_model_fields(model)
    kept = []
    for name in fields:
        if name in known:
            kept.append(name)
    return kept


def _name_subquery():
    """Build a subquery yielding the source name for the outer ad row.

    The name is ``NetworkMonitoredPage.name`` of a page whose
    ``network_ad_manual_id`` equals the outer row's primary key; only the
    first matching row is used.
    """
    # NOTE(review): no explicit ordering is applied here, so if several pages
    # can reference the same ad, which name is picked is up to the model's
    # default ordering / the database — confirm whether that can happen.
    pages_for_ad = NetworkMonitoredPage.objects.filter(
        network_ad_manual_id=OuterRef("pk")
    )
    first_name = pages_for_ad.values("name")[:1]
    return Subquery(first_name)


def _build_annotations_for_fields(model, fields: list[str]) -> dict:
    """Build the conditional ``Count`` annotations for ``fields``.

    Returns a dict with one ``"empty__<field>"`` entry per field (rows in
    which that field is empty) plus ``"empty_any"`` (rows in which at
    least one of the fields is empty).
    """
    conditions = [(name, _empty_q(model, name)) for name in fields]

    annotations = {}
    combined = Q()  # OR of all per-field conditions, used for "empty_any"
    for name, condition in conditions:
        annotations["empty__" + name] = Count("id", filter=condition)
        combined |= condition

    annotations["empty_any"] = Count("id", filter=combined)
    return annotations


def _count_snapshot(model, *, fields: list[str], group: bool, limit: int):
    """Collect empty-value counts for ``fields`` on ``model``.

    Args:
        model: Django model class to inspect.
        fields: Field names to check; each must exist on ``model``.
        group: When true, additionally break the counts down by source
            name (the value produced by ``_name_subquery``).
        limit: Number of groups copied into ``_by_name_top``; values
            below 1 are treated as 1.

    Returns:
        A dict with the keys ``ts`` (ISO timestamp), ``model`` (class
        name), ``total_ads``, ``fields`` and ``empty_totals``. When
        ``group`` is true it also contains ``by_name`` (all groups, in
        query order) and ``_by_name_top`` (the first ``max(1, limit)``
        of them).
    """
    total = model.objects.count()

    # Global totals per field: one conditional-aggregate query instead of a
    # separate COUNT query for every field.
    totals_row = model.objects.aggregate(
        **{"empty__" + fp: Count("id", filter=_empty_q(model, fp)) for fp in fields}
    )
    empty_totals = {fp: totals_row.get("empty__" + fp) or 0 for fp in fields}

    by_name_top = None
    by_name = None

    if group:
        rows = (
            model.objects
            .annotate(src_name=_name_subquery())
            .values("src_name")
            .annotate(total=Count("id"), **_build_annotations_for_fields(model, fields))
            .order_by("-empty_any", "-total")
        )
        by_name = {}
        for row in rows:
            # A missing name and an empty-string name share the "(null)" label.
            # Their counts are summed so neither group silently replaces the
            # other in the report.
            key = row["src_name"] or "(null)"
            entry = by_name.get(key)
            if entry is None:
                entry = by_name[key] = {
                    "total": 0,
                    "empty_any": 0,
                    "empty_by_field": dict.fromkeys(fields, 0),
                }
            entry["total"] += row["total"]
            entry["empty_any"] += row["empty_any"]
            # Iterate the dict keys (not ``fields``) so a name that appears
            # twice in ``fields`` is only counted once.
            per_field = entry["empty_by_field"]
            for fp in per_field:
                per_field[fp] += row.get("empty__" + fp, 0)
        by_name_top = dict(list(by_name.items())[: max(1, limit)])

    snap = {
        "ts": now().isoformat(),
        "model": model.__name__,
        "total_ads": total,
        "fields": fields,
        "empty_totals": empty_totals,
    }
    if by_name is not None:
        snap["by_name"] = by_name
        snap["_by_name_top"] = by_name_top
    return snap


def _print_snapshot(snap: dict):
    """Print a human-readable summary of ``snap`` to standard output.

    Always prints the header and the global per-field totals. The
    per-source section is printed only when ``snap`` carries a non-empty
    ``_by_name_top`` mapping.
    """
    print("\nAds fields health — snapshot")
    print(f"Time:        {snap['ts']}")
    print(f"Model:       {snap['model']}")
    print(f"Total ads:   {snap['total_ads']}")

    print("\nEmpty totals by field:")
    for field_name, empty_count in snap["empty_totals"].items():
        print(f"- {field_name:28} {empty_count:6d}")

    top_groups = snap.get("_by_name_top")
    if not top_groups:
        # Grouping was not requested (or produced no groups): nothing more to show.
        return

    print("\nBy source name (top):")
    for source, stats in top_groups.items():
        print(f"- {source:28} total={stats['total']:6d}  empty_any={stats['empty_any']:6d}")
        for field_name in snap["fields"]:
            # Fields absent from the group's breakdown are shown as zero.
            empty_count = stats["empty_by_field"].get(field_name, 0)
            print(f"    · {field_name:24} {empty_count:6d}")


def _append_jsonl(path: Path, snap: dict):
    """Append ``snap`` as one JSON line to the file at ``path``.

    Missing parent directories are created first. The whole dict is
    serialised (so a ``_by_name_top`` entry, when present, is written
    too), non-ASCII characters are kept as-is, and the file is UTF-8.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as handle:
        serialized = json.dumps(snap, ensure_ascii=False)
        handle.write(f"{serialized}\n")


class Command(BaseCommand):
    """Report how many ads have empty values in selected fields.

    The command counts empties per field on ``TARGET_MODEL`` (optionally
    grouped by source name), prints a summary and appends the snapshot
    as one line to a JSONL file.
    """

    help = (
        "Counts empties for key Ads fields (global and grouped by source name), "
        "and writes a JSONL history. Prints a console summary."
    )

    def add_arguments(self, parser):
        """Register the command-line options of the command."""
        parser.add_argument(
            "--fields",
            help="Comma separated list of fields on the Ads model. If omitted, uses built-in DEFAULT_FIELDS.",
        )
        parser.add_argument(
            "--group",
            action="store_true",
            help="Group by source name (NetworkMonitoredPage.name) and show top-N.",
        )
        parser.add_argument(
            "--limit",
            type=int,
            default=20,
            help="How many 'name' groups to print (top-N).",
        )
        parser.add_argument(
            "--output",
            "-o",
            default="ads_fields_health.jsonl",
            help="Path to JSONL output (appended).",
        )

    def handle(self, *args, **opts):
        """Resolve the field list, take the snapshot, print and persist it.

        Field names given via ``--fields`` that do not exist on the target
        model are reported on stderr and skipped; duplicates are collapsed.
        When no valid field remains, an error is written and nothing is
        counted or saved.
        """
        raw = (opts.get("fields") or "").strip()
        if raw:
            # dict.fromkeys removes duplicates while keeping the given order,
            # so a repeated name is not counted and printed twice.
            requested = list(dict.fromkeys(s.strip() for s in raw.split(",") if s.strip()))
        else:
            requested = DEFAULT_FIELDS
        fields = _sanitize_fields(TARGET_MODEL, requested)

        if raw:
            # Only explicitly requested names are reported: a typo in --fields
            # should not vanish silently. DEFAULT_FIELDS is filtered quietly.
            unknown = [name for name in requested if name not in fields]
            if unknown:
                self.stderr.write(
                    self.style.WARNING("Ignoring unknown fields: " + ", ".join(unknown))
                )

        if not fields:
            self.stderr.write(self.style.ERROR("No valid fields to check on the target model."))
            return

        snap = _count_snapshot(
            TARGET_MODEL,
            fields=fields,
            group=bool(opts.get("group")),
            limit=int(opts.get("limit") or 20),
        )

        _print_snapshot(snap)
        _append_jsonl(Path(opts["output"]), snap)
        self.stdout.write(self.style.SUCCESS(f"\nSaved to {opts['output']} (JSONL append)"))
