# -*- coding: utf-8 -*-
# your_app/management/commands/images_normalize.py
#
# Przykład użycia:
#   python manage.py images_normalize \
#     --model extractly.AdsManual \
#     --field images \
#     --dry-run
#
#   python manage.py images_normalize \
#     --model extractly.AdsManual \
#     --field images

import re
import itertools
from typing import Optional, Any, List

from django.apps import apps
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction

# URL prefixes treated as "ours"; used when no --allow-prefix option is given.
DEFAULT_ALLOWED_PREFIXES = [
    "https://extractly.s3.waw.io.cloud.ovh.net/",
    # add aliases here if you have any, e.g.:
    # "https://cdn.extractly.cloud/",
]

def _strip_wrappers(s: str) -> str:
    """
    Normalizuje dziwne warianty stringów:
    - usuwa wiodące/trailing spacje
    - usuwa wiodące b'/u' (b"...", u'...')
    - zdejmuje zewnętrzny pojedynczy/podwójny cudzysłów, także z sekwencjami \" / \'
    - od-unescapowuje \" -> ", \' -> '
    """
    if not isinstance(s, str):
        s = str(s)

    s = s.strip()

    # Usuń ewentualne prefiksy b'...' / u'...' / b"..." / u"..."
    if (len(s) >= 3) and s[0].lower() in ("b", "u") and s[1] in ("'", '"'):
        # np. b'...' lub u"..."
        s = s[2:]

    # Jeśli zaczyna się i/lub kończy cytatem/cudzysłowem, zdejmij go
    # Obsłuż też wariant z backslashami na brzegach: \"...\" lub \'...\'
    # 1) usuń backslash przed brzegowymi cudzysłowami
    if s.startswith(r'\"') and s.endswith(r'\"') and len(s) >= 4:
        s = s[2:-2]
    elif s.startswith(r"\'") and s.endswith(r"\'") and len(s) >= 4:
        s = s[2:-2]
    else:
        # zwykłe brzegowe "..." lub '...'
        if len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
            s = s[1:-1]

    # od-unescape
    # najpierw popularne sekwencje
    s = s.replace(r'\"', '"').replace(r"\'", "'")

    # Jeśli ktoś zapisał JSON-owo, czasem trafiają się podwójne backslashe \\"
    # zdejmij nadmiarowe backslashe przed cudzysłowem
    s = re.sub(r'\\+"', '"', s)
    s = re.sub(r"\\+'", "'", s)

    # jeszcze raz przytnij spacje
    s = s.strip()
    return s

def _is_url(s: str) -> bool:
    return isinstance(s, str) and (s.startswith("http://") or s.startswith("https://"))

def _is_ours(s: str, allowed: List[str]) -> bool:
    """Return True when *s* is an http(s) URL that starts with one of *allowed*."""
    if _is_url(s):
        for prefix in allowed:
            if s.startswith(prefix):
                return True
    return False

def _normalize_value(value: Any, allowed: List[str], only_ours: bool) -> Optional[Any]:
    """
    Extract clean image URL(s) from a raw field value.

    Supported inputs:
      - string: unwrapped and returned if it is an http(s) URL (and, when
        ``only_ours`` is set, starts with one of ``allowed``);
      - dict: URLs are gathered first from the keys "main", "url", "src" and
        "href", then from every value; a value may be a string or a
        list/tuple of strings;
      - list/tuple: URLs are gathered from string items and from dict items
        (handled as described above); other item types are skipped.

    Gathered URLs are de-duplicated keeping first-seen order. A single URL is
    returned as a plain string, several as a list. Returns None when nothing
    qualifies and for every other input type (including None).
    """

    def clean(raw: str) -> Optional[str]:
        # Unwrap one raw string; keep it only if it passes the URL filters.
        candidate = _strip_wrappers(raw)
        if not _is_url(candidate):
            return None
        if only_ours and not _is_ours(candidate, allowed):
            return None
        return candidate

    def harvest(node: Any) -> List[str]:
        # URLs from a string or from the string members of a list/tuple.
        if isinstance(node, str):
            raw_items = [node]
        elif isinstance(node, (list, tuple)):
            raw_items = [member for member in node if isinstance(member, str)]
        else:
            return []
        cleaned = (clean(raw) for raw in raw_items)
        return [url for url in cleaned if url is not None]

    def collapse(urls: List[str]) -> Optional[Any]:
        # Order-preserving de-duplication, then None / single string / list.
        unique = list(dict.fromkeys(urls))
        if not unique:
            return None
        if len(unique) == 1:
            return unique[0]
        return unique

    if isinstance(value, str):
        return clean(value)

    if isinstance(value, dict):
        found: List[str] = []
        # Well-known keys go first so their URLs lead the result.
        for key in ("main", "url", "src", "href"):
            found.extend(harvest(value.get(key)))
        # Then every value; repeats are removed by collapse().
        for entry in value.values():
            found.extend(harvest(entry))
        return collapse(found)

    if isinstance(value, (list, tuple)):
        gathered: List[str] = []
        for item in value:
            if isinstance(item, str):
                url = clean(item)
                if url is not None:
                    gathered.append(url)
            elif isinstance(item, dict):
                # The recursive call already applied the same URL filters.
                nested = _normalize_value(item, allowed, only_ours)
                if isinstance(nested, str):
                    gathered.append(nested)
                elif isinstance(nested, (list, tuple)):
                    gathered.extend(x for x in nested if isinstance(x, str))
        return collapse(gathered)

    # Any other type (None, numbers, ...) carries no URL.
    return None


class Command(BaseCommand):
    """
    Management command that rewrites an image field to clean URL values.

    Every row whose field is not NULL is passed through ``_normalize_value``.
    Rows for which nothing usable remains are set to NULL; rows whose
    normalized value differs from the stored one are updated. With
    ``--dry-run`` only the counters are reported and nothing is written.
    """

    help = (
        "Normalizuje pole 'images' (usuwa podwójne cudzysłowy/escape, b''/u''). "
        "Czyści stringi typu \"\\\"https://...jpg\\\"\" do czystego URL."
    )

    def add_arguments(self, parser):
        """Register the command-line options."""
        parser.add_argument("--model", default="extractly.AdsManual",
                            help="app_label.ModelName (domyślnie: extractly.AdsManual)")
        parser.add_argument("--field", default="images",
                            help="Nazwa pola z obrazkami (domyślnie: images)")
        parser.add_argument("--batch-size", type=int, default=5000,
                            help="Rozmiar batcha (domyślnie 5000)")
        parser.add_argument("--dry-run", action="store_true",
                            help="Pokaż co będzie zmienione, bez zapisu.")
        parser.add_argument("--only-ours", action="store_true",
                            help="Po normalizacji zostaw tylko URL-e z dozwolonym prefiksem (-s).")
        parser.add_argument("--allow-prefix", action="append", dest="allowed_prefixes",
                            help="Dozwolony prefiks (można podać wiele razy). Domyślnie tylko OVH.")

    def handle(self, *args, **opts):
        """
        Normalize the selected field batch by batch.

        Raises:
            CommandError: if the batch size is not positive, the model label
                is malformed or unknown, or the field does not exist.
        """
        model_label = opts["model"]
        field_name = opts["field"]
        batch_size = opts["batch_size"]
        dry = bool(opts["dry_run"])
        only_ours = bool(opts["only_ours"])
        allowed = opts.get("allowed_prefixes") or DEFAULT_ALLOWED_PREFIXES
        # Make every prefix end with "/" (builds a new list, the default is
        # left untouched).
        allowed = [p if p.endswith("/") else p + "/" for p in allowed]

        # Reject a non-positive batch size up front instead of letting the
        # queryset iterator fail with a raw ValueError.
        if batch_size is not None and batch_size < 1:
            raise CommandError(
                f"Parametr --batch-size musi być większy od zera (podano: {batch_size})."
            )

        try:
            Model = apps.get_model(model_label)
        except LookupError as exc:
            raise CommandError(f"Model '{model_label}' nie istnieje.") from exc
        except ValueError as exc:
            # get_model() raises ValueError for a label that is not exactly
            # "app_label.ModelName".
            raise CommandError(
                f"Nieprawidłowa etykieta modelu '{model_label}' "
                "(oczekiwano: app_label.ModelName)."
            ) from exc

        if field_name not in {f.name for f in Model._meta.get_fields()}:
            raise CommandError(f"Pole '{field_name}' nie istnieje w modelu {model_label}.")

        qs = Model.objects.filter(**{f"{field_name}__isnull": False}).order_by("pk")

        total = qs.count()
        self.stdout.write(f"Kandydatów ({field_name} != NULL): {total}")
        processed = changed = nulled = 0

        # Stream (pk, value) rows and consume them in batch_size slices.
        it = qs.values("pk", field_name).iterator(chunk_size=batch_size)
        while True:
            chunk = list(itertools.islice(it, batch_size))
            if not chunk:
                break

            to_update = []
            to_null = []

            for row in chunk:
                pk = row["pk"]
                val = row[field_name]

                new_val = _normalize_value(val, allowed, only_ours)

                if new_val is None:
                    # Nothing usable is left after normalization: clear it.
                    to_null.append(pk)
                elif new_val != val:
                    # Write only when the value actually changes.
                    to_update.append((pk, new_val))

            processed += len(chunk)
            changed += len(to_update)
            nulled += len(to_null)

            if not dry:
                # One transaction per batch: a batch is applied completely
                # or not at all.
                with transaction.atomic():
                    if to_null:
                        Model.objects.filter(pk__in=to_null).update(**{field_name: None})
                    for pk, nv in to_update:
                        Model.objects.filter(pk=pk).update(**{field_name: nv})

            self.stdout.write(
                f"processed={processed}/{total} | to_update={len(to_update)} | to_null={len(to_null)}"
            )

        if dry:
            self.stdout.write(self.style.SUCCESS(
                f"[DRY] DONE | changed={changed} | nulled={nulled}"
            ))
        else:
            self.stdout.write(self.style.SUCCESS(
                f"DONE | changed={changed} | nulled={nulled}"
            ))