
# manual_agregator/images.py
import re
import json
import io
import hashlib
import mimetypes
import logging
from collections.abc import Iterable as CollIterable
from typing import Any, Optional, Tuple
from urllib.parse import urljoin, urlparse
import httpx
from django.db import transaction
from django.utils import timezone
from cloud_storage.services import upload_to_ovh
logger = logging.getLogger(__name__)


# MIME types accepted for a downloaded image (checked in `_safe_download`).
ALLOWED_MIME = {"image/jpeg", "image/png", "image/webp", "image/gif"}
# Non-standard content-type spellings mapped to their canonical form.
CONTENT_TYPE_FIX = {"image/jpg": "image/jpeg"}
# Upper bound on the size of a downloaded image body: 20 MiB.
MAX_BYTES = 20 * 1024 * 1024
# httpx timeouts in seconds: 20 s by default, 6 s to connect, 20 s to read.
HTTP_TIMEOUT = httpx.Timeout(20.0, connect=6.0, read=20.0)

# Lowercase fragments that mark a URL as a placeholder/logo rather than a real
# photo; `_is_placeholder` matches them against the lowercased URL.
PLACEHOLDER_SUBSTRINGS = (
    "no-photo", "no_photo", "placeholder", "/og_image_main.jpg", "/static-gh/", "brak_zdjecia", "logo.jpg", "/gfx/logotypy/",
)

# Finds an http(s) URL ending in a known image extension inside arbitrary text.
# A trailing query string or fragment is consumed by the match but is NOT part
# of the named group `url`.
IMG_URL_RE = re.compile(
    r'(?P<url>https?://[^\s"\'<>]+?\.(?:jpg|jpeg|png|webp|gif))(?:[?#][^\s"\'<>]*)?',
    re.IGNORECASE
)

def _is_placeholder(u: str) -> bool:
    """Return True when the URL contains any known placeholder marker.

    The comparison is case-insensitive; a falsy ``u`` is treated as "".
    """
    lowered = u.lower() if u else ""
    for marker in PLACEHOLDER_SUBSTRINGS:
        if marker in lowered:
            return True
    return False

def _looks_like_json(s: str) -> bool:
    s = (s or "").strip()
    return (s.startswith("[") and s.endswith("]")) or (s.startswith("{") and s.endswith("}"))

def _fix_scheme_slashes(u: str) -> str:
    # napraw "https:/domain" -> "https://domain" (tylko gdy jest pojedynczy slash)
    if u.lower().startswith("https:/") and not u.lower().startswith("https://"):
        return "https://" + u[7:]
    if u.lower().startswith("http:/") and not u.lower().startswith("http://"):
        return "http://" + u[6:]
    return u

def _strip_quotes(u: str) -> str:
    # usuń otaczające cudzysłowy / spacje
    return (u or "").strip().strip('\'"').strip()

def _extract_img_url_from_text(s: str) -> Optional[str]:
    """
    Pull the first image URL out of a longer piece of text.

    Useful when the string holds a page URL followed by a quoted image URL.
    Returns the `url` group of the first IMG_URL_RE match, or None when the
    text contains no such URL.
    """
    match = IMG_URL_RE.search(s if s else "")
    return match.group("url") if match else None

def _normalize_candidate_str(raw: str, base_url: str) -> Optional[str]:
    """
    Turn a messy scraped string into an absolute http(s) image URL.

    Steps:
      - strip surrounding quotes and whitespace,
      - if an image URL is embedded in surrounding junk, cut it out,
      - repair "https:/" -> "https://",
      - resolve against ``base_url`` ONLY for root-relative paths ("/...").

    Args:
        raw: candidate string taken from scraped data.
        base_url: page URL used to resolve root-relative paths.

    Returns:
        The absolute URL, or None when the input is empty, is a placeholder,
        cannot be resolved to an http(s) URL, or has an unsupported form
        (e.g. a path without a leading slash).
    """
    if not raw:
        return None

    s = _strip_quotes(raw)
    # Prefer an image URL embedded in the text over the raw string itself.
    s = _extract_img_url_from_text(s) or s
    s = _fix_scheme_slashes(s)

    http_prefixes = ("http://", "https://")

    if s.lower().startswith(http_prefixes):
        candidate = s
    elif s.startswith("/"):
        try:
            candidate = urljoin(base_url or "", s)
        except ValueError:
            # urljoin raises on malformed input such as "//[bad/x.jpg"
            # ("Invalid IPv6 URL"); a broken candidate is simply not usable.
            return None
        # With an empty or non-http base the join stays relative (or uses
        # another scheme); such a value cannot be downloaded, so reject it.
        if not candidate.lower().startswith(http_prefixes):
            return None
    else:
        # Anything else (bare relative paths, data: URIs, ...) is rejected.
        return None

    return None if _is_placeholder(candidate) else candidate

def _first_image_url(raw: Any, base_url: str) -> Optional[str]:
    """
    Return the absolute URL of the first usable image found in ``raw``.

    Accepted shapes:
      - str: a plain URL, a root-relative path, or JSON embedded in a string
        (e.g. '["https://..."]' or '{"url": "..."}'),
      - dict: looked up under the keys url / src / href, in that order,
      - list / other iterable: items that are strings or such dicts.
    Placeholder images are skipped; None means nothing usable was found.
    """
    url_keys = ("url", "src", "href")

    def from_mapping(mapping: dict) -> Optional[str]:
        # First key whose value normalizes to a usable URL wins.
        for key in url_keys:
            value = mapping.get(key)
            if not value:
                continue
            found = _normalize_candidate_str(str(value), base_url)
            if found:
                return found
        return None

    if not raw:
        return None

    payload = raw

    # 1) String: a bare URL, a relative path, or JSON serialized as text.
    if isinstance(payload, str):
        text = payload.strip()
        if not text:
            return None
        if not _looks_like_json(text):
            return _normalize_candidate_str(text, base_url)
        try:
            payload = json.loads(text)
        except Exception:
            # Not valid JSON after all: still try to dig a URL out of the text.
            return _normalize_candidate_str(text, base_url)

    # 2) Dict: the classic url/src/href fields.
    if isinstance(payload, dict):
        return from_mapping(payload)

    # 3) Iterable of strings and/or dicts; anything else is unsupported.
    if isinstance(payload, (str, bytes)) or not isinstance(payload, CollIterable):
        return None

    for entry in payload:
        if isinstance(entry, dict):
            found = from_mapping(entry)
            if found:
                return found
        elif isinstance(entry, str) and entry.strip():
            # The item may itself be JSON-in-a-string; try that first.
            if _looks_like_json(entry):
                try:
                    found = _first_image_url(json.loads(entry), base_url)
                except Exception:
                    found = None
                if found:
                    return found
            # Otherwise (or if JSON gave nothing) treat it as a plain string.
            found = _normalize_candidate_str(entry, base_url)
            if found:
                return found

    return None


def _ext_from_content_type(ct: str) -> str:
    """Map a content-type value to a file extension (with leading dot).

    Parameters after ";" (e.g. "image/png; charset=binary") are ignored, and
    known non-standard spellings are canonicalized through CONTENT_TYPE_FIX.
    The four supported image types get fixed extensions; anything else is
    looked up with ``mimetypes`` and falls back to ".bin".
    """
    # Keep only the media type itself; parameters would defeat every lookup.
    mime = (ct or "").split(";", 1)[0].strip().lower()
    mime = CONTENT_TYPE_FIX.get(mime, mime)

    known_extensions = {
        "image/jpeg": ".jpg",
        "image/png": ".png",
        "image/webp": ".webp",
        "image/gif": ".gif",
    }
    if mime in known_extensions:
        return known_extensions[mime]

    ext = mimetypes.guess_extension(mime) or ".bin"
    # Some mimetypes tables report ".jpe" for JPEG variants; prefer ".jpg".
    return ".jpg" if ext == ".jpe" else ext


def _safe_download(url: str) -> Tuple[bytes, str]:
    """Download an image over HTTP(S) with type and size checks.

    The request is sent with a browser-like User-Agent and, when the URL has
    a scheme and host, a Referer pointing at that origin. Redirects are
    followed.

    Args:
        url: absolute URL of the image.

    Returns:
        ``(body, content_type)`` where ``content_type`` is one of ALLOWED_MIME.

    Raises:
        httpx.HTTPError: on transport errors or a non-2xx status
            (via ``raise_for_status``).
        ValueError: when the content type is unsupported and cannot be
            inferred from the URL path, when the body exceeds MAX_BYTES,
            or when the body is empty.
    """
    parsed = urlparse(url)
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124 Safari/537.36",
        "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*;q=0.8,*/*;q=0.5",
    }
    if parsed.scheme and parsed.netloc:
        headers["Referer"] = f"{parsed.scheme}://{parsed.netloc}/"

    with httpx.Client(timeout=HTTP_TIMEOUT, follow_redirects=True, headers=headers) as client:
        with client.stream("GET", url) as resp:
            resp.raise_for_status()

            ct = (resp.headers.get("content-type", "") or "").split(";")[0].strip().lower()
            ct = CONTENT_TYPE_FIX.get(ct, ct)

            if ct not in ALLOWED_MIME:
                # The server sent a missing/generic type: fall back to the
                # extension in the requested URL's path.
                path = (parsed.path or "").lower()
                if path.endswith((".jpg", ".jpeg")):
                    ct = "image/jpeg"
                elif path.endswith(".png"):
                    ct = "image/png"
                elif path.endswith(".webp"):
                    ct = "image/webp"
                elif path.endswith(".gif"):
                    ct = "image/gif"
                else:
                    raise ValueError(f"Unsupported content-type: {ct or 'unknown'}")

            # Stream the body so an oversized response is aborted early
            # instead of being read fully into memory.
            buf = io.BytesIO()
            for chunk in resp.iter_bytes():
                if not chunk:
                    continue
                buf.write(chunk)
                if buf.tell() > MAX_BYTES:
                    raise ValueError(f"Image too large > {MAX_BYTES} bytes")

            # A 2xx response with no bytes is not an image; without this
            # check the caller would upload and store a zero-byte file.
            if buf.tell() == 0:
                raise ValueError("Empty image body")

            return buf.getvalue(), ct





def _flag_for_status_check(instance, *, note: str = "") -> None:
    """
    Mark a listing for a status-only re-check after an image failure.

      - check_active=True             -> the listing is queued for checking
      - check_active_from_image=True  -> records that an image failure was
                                         the reason (only if the field exists)
    ``is_active`` is deliberately left untouched here; the checker decides.

    When the instance has a ``meta`` attribute holding a dict (or nothing),
    a timestamped entry with ``note`` is appended to
    ``meta["image_failures"]``. A ``meta`` of any other type is left alone so
    that a malformed value cannot prevent the flags from being saved.

    This is best-effort: any exception is logged and swallowed.

    Args:
        instance: model instance to flag.
        note: short machine-readable reason stored in the meta trail.
    """
    try:
        # Set the flags and collect only the fields that actually exist, so
        # save(update_fields=...) never references a missing field.
        instance.check_active = True
        update_fields = ["check_active"]

        if hasattr(instance, "check_active_from_image"):
            instance.check_active_from_image = True
            update_fields.append("check_active_from_image")

        if hasattr(instance, "meta"):
            meta = getattr(instance, "meta") or {}
            if isinstance(meta, dict):
                failures = meta.get("image_failures")
                if not isinstance(failures, list):
                    # Missing or malformed trail: start a fresh list.
                    failures = []
                    meta["image_failures"] = failures
                if note:
                    failures.append(
                        {"ts": timezone.now().isoformat(), "note": note}
                    )
                instance.meta = meta
                update_fields.append("meta")
            else:
                logger.warning(
                    "Skipping image-failure note: meta is %s, not a dict (id=%s)",
                    type(meta).__name__,
                    getattr(instance, "id", None),
                )

        instance.save(update_fields=update_fields)
        logger.info("Flagged for status-check (image failure) id=%s url=%s", getattr(instance, "id", None), getattr(instance, "url", None))
    except Exception as e:
        logger.warning("Failed to flag instance for status-check (image failure): %s", e)


def store_main_image(instance) -> Optional[str]:
    """
    Fetch the first image listed in ``instance.original_image_urls``, upload
    it via ``upload_to_ovh`` and store the resulting public URL (a plain
    string) in the ``images`` field.

    If no candidate is found, or the download or upload fails:
      - the listing is flagged for re-checking through
        ``_flag_for_status_check`` with a note naming the failure,
      - ``is_active`` is NOT changed here; the checker takes care of it.

    Returns:
        The public URL on success, otherwise None.
    """

    def current_id():
        # Read lazily so every log line reflects the instance's current id.
        return getattr(instance, "id", None)

    page_url = getattr(instance, "url", "") or ""
    candidates = getattr(instance, "original_image_urls", None)
    source_url = _first_image_url(candidates, base_url=page_url)
    if not source_url:
        logger.info("No image candidates for instance id=%s", current_id())
        _flag_for_status_check(instance, note="no_image_candidates")
        return None

    try:
        payload, mime = _safe_download(source_url)
    except Exception as exc:
        logger.warning("Download image failed for %s (id=%s): %s", source_url, current_id(), exc)
        _flag_for_status_check(instance, note=f"download_error:{type(exc).__name__}")
        return None

    # Object key: images/<model>/<id>/<YYYY/MM/DD>/<sha256 prefix><ext>
    digest = hashlib.sha256(payload).hexdigest()[:16]
    suffix = _ext_from_content_type(mime)
    model_name = instance.__class__.__name__.lower()
    obj_id = getattr(instance, "id", "unknown")
    date_path = timezone.now().strftime("%Y/%m/%d")
    object_key = f"images/{model_name}/{obj_id}/{date_path}/{digest}{suffix}"

    try:
        public_url = upload_to_ovh(
            payload,
            object_key,
            content_type=mime,
            cache_control="public, max-age=31536000, immutable",
        )
    except Exception as exc:
        logger.error("Upload to OVH failed for key=%s (id=%s): %s", object_key, current_id(), exc, exc_info=True)
        _flag_for_status_check(instance, note=f"upload_error:{type(exc).__name__}")
        return None

    if not public_url:
        logger.error("OVH upload returned empty URL for key=%s (id=%s)", object_key, current_id())
        _flag_for_status_check(instance, note="upload_empty_url")
        return None

    # Persist only the URL string; the status-check flags are left as they are.
    with transaction.atomic():
        instance.images = public_url
        instance.save(update_fields=["images"])

    logger.info("Stored main image for id=%s -> %s", current_id(), public_url)
    return public_url
