# extractly/management/commands/html_health.py
from django.core.management.base import BaseCommand
from django.db.models import Q, Count
from django.utils.timezone import now
from pathlib import Path
import json
import re

from extractly.models import NetworkMonitoredPage

def _count_snapshot(group: bool, limit: int):
    """Build a snapshot of how many pages have empty HTML fields.

    A field counts as "empty" when it is NULL, matches ``^\\s*$`` (blank or
    whitespace only), or equals the literal string ``"error"``.

    All global counters are computed in a single ``aggregate()`` query, so
    they are mutually consistent and the table is not scanned once per
    counter.

    Args:
        group: When true, also compute the same counters per ``name``.
        limit: Number of per-name groups kept in the console-only
            ``_by_name_top`` entry (at least one group is always kept).

    Returns:
        A dict with ``ts``, ``total_pages`` and six counters (``empty_*`` and
        ``inactive_empty_*``). With ``group`` it additionally carries
        ``by_name`` (every group) and ``_by_name_top`` (the first
        ``max(1, limit)`` groups in query order).
    """
    # Definitions of "empty".
    empty_html_q = Q(html__isnull=True) | Q(html__regex=r"^\s*$") | Q(html__exact="error")
    empty_sliced_q = (
        Q(sliced_html__isnull=True)
        | Q(sliced_html__regex=r"^\s*$")
        | Q(sliced_html__exact="error")
    )
    empty_both_q = empty_html_q & empty_sliced_q
    inactive_q = Q(is_active=False)

    def counters():
        # Built on demand so each query receives its own expression objects.
        # Insertion order here is also the key order of the snapshot.
        return {
            "empty_html": Count("id", filter=empty_html_q),
            "empty_sliced_html": Count("id", filter=empty_sliced_q),
            "empty_both": Count("id", filter=empty_both_q),
            # Of the empty ones: how many have is_active=False.
            "inactive_empty_html": Count("id", filter=empty_html_q & inactive_q),
            "inactive_empty_sliced_html": Count("id", filter=empty_sliced_q & inactive_q),
            "inactive_empty_both": Count("id", filter=empty_both_q & inactive_q),
        }

    counter_keys = tuple(counters())

    # Global counters: one round trip instead of seven separate COUNT queries.
    totals = NetworkMonitoredPage.objects.aggregate(
        total_pages=Count("id"),
        **counters(),
    )

    by_name = None
    by_name_top = None
    if group:
        rows = (
            NetworkMonitoredPage.objects.values("name")
            .annotate(total=Count("id"), **counters())
            .order_by("-empty_html", "-empty_sliced_html", "-total")
        )
        by_name = {}
        for row in rows:
            # NULL and empty names share the "(null)" bucket; a later row
            # overwrites an earlier one with the same key.
            by_name[row["name"] or "(null)"] = {
                "total": row["total"],
                **{key: row[key] for key in counter_keys},
            }
        by_name_top = dict(list(by_name.items())[: max(1, limit)])

    snap = {
        "ts": now().isoformat(),
        "total_pages": totals["total_pages"],
    }
    snap.update((key, totals[key]) for key in counter_keys)

    if by_name is not None:
        snap["by_name"] = by_name
        snap["_by_name_top"] = by_name_top

    return snap


def _print_snapshot(snap: dict):
    """Print a human-readable summary of ``snap`` to standard output.

    Prints the timestamp, the total page count and the three "empty"
    counters, each followed by its inactive share (0 when the key is absent).
    If ``snap`` has a non-empty ``_by_name_top`` mapping, one aligned row is
    printed per name.
    """
    print("")
    print("HTML health — current snapshot")
    print(f"Time:               {snap['ts']}")
    print(f"Total pages:        {snap['total_pages']}")

    # (padded label, counter key, key of the matching inactive counter)
    global_rows = (
        ("Empty html:         ", "empty_html", "inactive_empty_html"),
        ("Empty sliced_html:  ", "empty_sliced_html", "inactive_empty_sliced_html"),
        ("Empty BOTH:         ", "empty_both", "inactive_empty_both"),
    )
    for label, key, inactive_key in global_rows:
        print(f"{label}{snap[key]}   (inactive: {snap.get(inactive_key, 0)})")

    top_groups = snap.get("_by_name_top")
    if not top_groups:
        return

    print("")
    print("By name (top):")
    for group_name, counts in top_groups.items():
        html_part = (
            f"empty_html={counts['empty_html']:6d} "
            f"(inact:{counts.get('inactive_empty_html',0):6d})"
        )
        sliced_part = (
            f"empty_sliced={counts['empty_sliced_html']:6d} "
            f"(inact:{counts.get('inactive_empty_sliced_html',0):6d})"
        )
        both_part = (
            f"both={counts['empty_both']:6d} "
            f"(inact:{counts.get('inactive_empty_both',0):6d})"
        )
        print(
            f"- {group_name:30}  total={counts['total']:6d}  "
            f"{html_part}  {sliced_part}  {both_part}"
        )




def _append_txt(path: Path, snap: dict):
    """Append a plain-text rendering of ``snap`` to the file at ``path``.

    Each record starts with a ``SNAPSHOT <ts>`` line and ends with ``---``.
    When ``snap`` has a non-empty ``by_name`` mapping, one line per name is
    included. Missing parent directories are created.
    """
    record = [
        f"SNAPSHOT {snap['ts']}",
        f"total_pages:       {snap['total_pages']}",
        f"empty_html:        {snap['empty_html']} (inactive: {snap.get('inactive_empty_html', 0)})",
        f"empty_sliced_html: {snap['empty_sliced_html']} (inactive: {snap.get('inactive_empty_sliced_html', 0)})",
        f"empty_both:        {snap['empty_both']} (inactive: {snap.get('inactive_empty_both', 0)})",
    ]

    per_name = snap.get("by_name")
    if per_name:
        record.append("by_name:")
        record.extend(
            f"  - {group_name}: total={counts['total']}, "
            f"empty_html={counts['empty_html']} (inact:{counts.get('inactive_empty_html',0)}), "
            f"empty_sliced_html={counts['empty_sliced_html']} (inact:{counts.get('inactive_empty_sliced_html',0)}), "
            f"both={counts['empty_both']} (inact:{counts.get('inactive_empty_both',0)})"
            for group_name, counts in per_name.items()
        )

    record.append("---")

    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as handle:
        handle.write("\n".join(record) + "\n")



def _append_jsonl(path: Path, snap: dict):
    """Append ``snap`` to ``path`` as one JSON document per line.

    The console-only ``_by_name_top`` entry is left out of the stored record.
    Missing parent directories are created.
    """
    path.parent.mkdir(parents=True, exist_ok=True)

    record = dict(snap)
    record.pop("_by_name_top", None)
    serialized = json.dumps(record, ensure_ascii=False)

    with path.open("a", encoding="utf-8") as handle:
        handle.write(f"{serialized}\n")


def _append_json_array(path: Path, snap: dict):
    """Append ``snap`` to the JSON array stored at ``path``.

    The file is read, the snapshot (without the console-only ``_by_name_top``
    entry) is appended to the decoded list, and the whole array is written
    back with two-space indentation. Missing parent directories are created.

    If the existing file cannot be decoded as JSON, or decodes to something
    other than a list, it is moved to ``<name>.bak`` (replacing any earlier
    backup) before a fresh array is started, so damaged history is preserved
    rather than silently overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    record = {k: v for k, v in snap.items() if k != "_by_name_top"}

    history = []
    if path.exists():
        try:
            loaded = json.loads(path.read_text(encoding="utf-8") or "[]")
        except OSError:
            # Could not read at all (e.g. permissions, or path is a
            # directory): nothing to salvage, go straight to the write.
            loaded = []
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError.
            loaded = None

        if isinstance(loaded, list):
            history = loaded
        else:
            backup = path.with_name(path.name + ".bak")
            try:
                path.replace(backup)
            except OSError:
                # Best effort: if the backup cannot be made, still record
                # the new snapshot.
                pass

    history.append(record)
    path.write_text(json.dumps(history, ensure_ascii=False, indent=2), encoding="utf-8")


def _history_line(snap: dict) -> str:
    """Render one stored snapshot as the one-line summary used in history listings."""
    return (
        f"- {snap.get('ts')}  total={snap.get('total_pages')}  "
        f"empty_html={snap.get('empty_html')}  "
        f"empty_sliced={snap.get('empty_sliced_html')}  "
        f"both={snap.get('empty_both')}"
    )


def _show_last_from_file(path: Path, last: int):
    """Print the last ``last`` snapshots stored in the history file ``path``.

    The format is chosen from the file extension: ``.json`` is read as a JSON
    array, ``.txt`` as ``SNAPSHOT``-delimited text blocks, and everything else
    (``.jsonl``, no extension, or an unrecognised extension) as JSON Lines.
    Unrecognised extensions are read as JSON Lines because that is the format
    ``Command.handle`` writes for them.

    Nothing is printed when ``last`` is not positive. Entries that cannot be
    decoded are reported and skipped; any other failure while reading is
    reported on a single line instead of being raised.
    """
    if last <= 0:
        return
    if not path.exists():
        print(f"\n(no history file at {path})")
        return

    ext = path.suffix.lower()
    print(f"\nLast {last} snapshots from {path}:")
    try:
        if ext == ".json":
            arr = json.loads(path.read_text(encoding="utf-8") or "[]")
            if not isinstance(arr, list):
                print("(history file does not contain a JSON array)")
                return
            for snap in arr[-last:]:
                if isinstance(snap, dict):
                    print(_history_line(snap))
                else:
                    print("- (skipped malformed entry)")
        elif ext == ".txt":
            text = path.read_text(encoding="utf-8")
            blocks = re.split(r"^SNAPSHOT ", text, flags=re.MULTILINE)
            snaps = [b for b in blocks if b.strip()]
            for b in snaps[-last:]:
                first_line = b.splitlines()[0].strip()
                print(f"- {first_line}")
        else:
            # Drop blank lines BEFORE taking the tail so that trailing empty
            # lines do not eat into the requested number of records.
            records = [
                line
                for line in path.read_text(encoding="utf-8").splitlines()
                if line.strip()
            ]
            for line in records[-last:]:
                try:
                    snap = json.loads(line)
                except ValueError:
                    snap = None
                if isinstance(snap, dict):
                    print(_history_line(snap))
                else:
                    print("- (skipped malformed entry)")
    except Exception as e:
        # Display-only boundary: showing history must never abort the command.
        print(f"(failed to read history: {e})")


class Command(BaseCommand):
    """Management command that records a snapshot of empty-HTML counters.

    The snapshot is computed by ``_count_snapshot``, optionally printed, and
    appended to a history file whose format follows the output file's
    extension.
    """

    help = "Counts empty HTML fields in NetworkMonitoredPage and writes a snapshot to a TXT or JSON file (with history)."

    def add_arguments(self, parser):
        """Register the command-line options of the command."""
        add = parser.add_argument
        add(
            "--output",
            "-o",
            default="html_health.jsonl",
            help="Output file path. Extension decides format: .jsonl (default), .json, or .txt",
        )
        add(
            "--group",
            action="store_true",
            help="Include per-name breakdown in snapshot (can be large).",
        )
        add(
            "--limit",
            type=int,
            default=25,
            help="How many 'name' groups to print in console (top-N).",
        )
        add(
            "--last",
            type=int,
            default=0,
            help="After writing, display last N snapshots from the output file.",
        )
        add(
            "--no-print",
            action="store_true",
            help="Do not print current snapshot to console (still writes to file).",
        )

    def handle(self, *args, **opts):
        """Compute the snapshot, optionally print it, store it, and show history.

        An unrecognised output extension is stored as JSON Lines with a
        warning. History is displayed only when ``--last`` is positive.
        """
        destination = Path(opts["output"])
        include_groups = bool(opts.get("group"))
        top_n = int(opts.get("limit") or 25)  # a missing or zero limit falls back to 25

        snapshot = _count_snapshot(group=include_groups, limit=top_n)

        if not opts.get("no_print"):
            _print_snapshot(snapshot)

        # extension -> (writer function, label shown in the success message)
        writers = {
            ".jsonl": (_append_jsonl, "JSONL append"),
            "": (_append_jsonl, "JSONL append"),
            ".json": (_append_json_array, "JSON array"),
            ".txt": (_append_txt, "TXT append"),
        }
        suffix = destination.suffix.lower()
        chosen = writers.get(suffix)
        if chosen is None:
            _append_jsonl(destination, snapshot)
            self.stdout.write(
                self.style.WARNING(f"\nUnknown extension '{suffix}', saved as JSONL to {destination}")
            )
        else:
            write_snapshot, label = chosen
            write_snapshot(destination, snapshot)
            self.stdout.write(self.style.SUCCESS(f"\nSaved to {destination} ({label})"))

        history_count = int(opts.get("last") or 0)
        if history_count > 0:
            _show_last_from_file(destination, history_count)
