# -*- coding: utf-8 -*-
# your_app/management/commands/images_prefixes.py

import csv
from collections import Counter, defaultdict
from urllib.parse import urlparse

from django.apps import apps
from django.core.management.base import BaseCommand, CommandError

class Command(BaseCommand):
    """
    Scans the 'images' field and prints URL prefixes with their counts.
    Handles: a plain string, a dict (main/url/src/href keys), or a list of either.
    """
    help = "Collects and counts URL prefixes from the 'images' field."

    def add_arguments(self, parser):
        parser.add_argument("--model", default="extractly.AdsManual",
                            help="app_label.ModelName (domyślnie: extractly.AdsManual)")
        parser.add_argument("--images-field", default="images",
                            help="Nazwa pola (domyślnie: images)")
        parser.add_argument("--batch-size", type=int, default=5000,
                            help="Wielkość batcha (domyślnie 5000)")
        parser.add_argument("--limit", type=int, default=0,
                            help="Maks. liczba rekordów do przejrzenia (0=bez limitu)")
        parser.add_argument("--mode", choices=["domain", "hostpath1", "hostpath2", "fullpath"],
                            default="domain",
                            help=(
                                "Jak budować prefiks:\n"
                                " - domain: scheme://host/\n"
                                " - hostpath1: scheme://host/segment1/\n"
                                " - hostpath2: scheme://host/segment1/segment2/\n"
                                " - fullpath: scheme://host/pełna_ścieżka_bez_pliku\n"
                            ))
        parser.add_argument("--top", type=int, default=30,
                            help="Pokaż TOP N prefiksów (domyślnie 30)")
        parser.add_argument("--csv", default="",
                            help="Ścieżka do CSV z wynikami (opcjonalnie)")

    def handle(self, *args, **opts):
        model_label = opts["model"]
        field_name = opts["images_field"]
        batch = opts["batch_size"]
        limit = int(opts["limit"] or 0)
        mode = opts["mode"]
        topn = int(opts["top"] or 30)
        csv_path = opts["csv"]

        Model = apps.get_model(model_label)
        # Validate that the requested field exists on the model.
        field_names = {f.name for f in Model._meta.get_fields()}
        if field_name not in field_names:
            raise SystemExit(f"Pole '{field_name}' nie istnieje w {model_label}")

        # Only rows where the field is non-NULL; values are parsed in Python below.
        qs = (Model.objects
              .filter(**{f"{field_name}__isnull": False})
              .order_by("pk"))

        total = qs.count()
        self.stdout.write(f"Rekordów z {field_name} != NULL: {total}")

        # Helper: extract URLs from the various shapes the field can take.
        def _is_http(s):
            return isinstance(s, str) and s.startswith(("http://", "https://"))

        def extract_urls(value):
            urls = []
            if value is None:
                return urls
            if isinstance(value, str):
                v = value.strip()
                if _is_http(v):
                    urls.append(v)
                return urls
            if isinstance(value, dict):
                for k in ("main", "url", "src", "href"):
                    v = value.get(k)
                    if _is_http(v):
                        urls.append(v.strip())
                    elif isinstance(v, (list, tuple)):
                        urls.extend(x.strip() for x in v if _is_http(x))
                # Fallback: any other string values in the dict; the preferred
                # keys above are skipped so their URLs are not counted twice.
                for k, v in value.items():
                    if k not in ("main", "url", "src", "href") and _is_http(v):
                        urls.append(v.strip())
                return urls
            if isinstance(value, (list, tuple)):
                for item in value:
                    if _is_http(item):
                        urls.append(item.strip())
                    elif isinstance(item, dict):
                        for k in ("main", "url", "src", "href"):
                            v = item.get(k)
                            if _is_http(v):
                                urls.append(v.strip())
            return urls
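
        # Illustrative behaviour (example.com URLs are placeholders):
        #   extract_urls("https://cdn.example.com/a.jpg")       -> ["https://cdn.example.com/a.jpg"]
        #   extract_urls({"main": "https://cdn.example.com/a"}) -> ["https://cdn.example.com/a"]
        #   extract_urls(["https://x.example.com/1.jpg",
        #                 {"url": "https://y.example.com/2.jpg"}]) -> both URLs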

        def prefix_for(url: str) -> str:
            p = urlparse(url)
            host = f"{p.scheme}://{p.netloc}/"
            if mode == "domain":
                return host
            # Split the path into non-empty segments; file names are handled per mode.
            path = (p.path or "").lstrip("/")
            segments = [seg for seg in path.split("/") if seg]
            if mode == "hostpath1":
                return host + (segments[0] + "/" if segments else "")
            if mode == "hostpath2":
                if len(segments) >= 2:
                    return host + segments[0] + "/" + segments[1] + "/"
                return host + (segments[0] + "/" if segments else "")
            if mode == "fullpath":
                # Strip the last segment if it looks like a file name (contains a dot).
                if segments and "." in segments[-1]:
                    segments = segments[:-1]
                return host + ("/".join(segments) + ("/" if segments else ""))
            return host  # fallback
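
        # Illustrative prefixes for "https://img.example.com/ads/2024/photo.jpg":
        #   domain    -> https://img.example.com/
        #   hostpath1 -> https://img.example.com/ads/
        #   hostpath2 -> https://img.example.com/ads/2024/
        #   fullpath  -> https://img.example.com/ads/2024/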

        counter = Counter()
        samples = defaultdict(list)

        processed = 0
        # .iterator(chunk_size=...) streams rows from the database in batches
        # instead of materialising the whole queryset in memory.
        for row in qs.values("pk", field_name).iterator(chunk_size=batch):
            if limit and processed >= limit:
                break
            processed += 1
            for u in extract_urls(row[field_name]):
                pref = prefix_for(u)
                counter[pref] += 1
                if len(samples[pref]) < 3:  # keep a few sample URLs per prefix
                    samples[pref].append(u)

        self.stdout.write(f"Przeprocesowano rekordów: {processed}")

        if not counter:
            self.stdout.write("Brak URL-i do zliczenia.")
            return

        # TOP N
        self.stdout.write("\nTOP prefiksy:")
        for pref, cnt in counter.most_common(topn):
            ex = "; ".join(samples[pref])
            self.stdout.write(f"{cnt:8d}  {pref}   # {ex}")

        # Optional CSV report.
        if csv_path:
            with open(csv_path, "w", newline="", encoding="utf-8") as f:
                w = csv.writer(f)
                w.writerow(["prefix", "count", "sample1", "sample2", "sample3"])
                for pref, cnt in counter.most_common():
                    ss = samples[pref]
                    # Pad the sample columns so every row has exactly five cells.
                    w.writerow([pref, cnt] + ss + [""] * (3 - len(ss)))
            self.stdout.write(self.style.SUCCESS(f"CSV written: {csv_path}"))
