"""
dynamic_parser.py
=================
Uniwersalny parser ogłoszeń HTML + wrapper `raw_data_cleaner()`.

Zwraca słownik:
    raw_text      – złączony tekst wszystkich sekcji
    image_links   – lista src grafik
    parse_data    – {
        description    … str  albo list[str]  (patrz sekcja „Opis”)
        status         … available / unavailable / reserved / None
        available_from … "YYYY-MM-DD" lub None
        <key:value>    … reszta cech
      }
"""

from __future__ import annotations

import itertools
import json
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Sequence, Tuple

from bs4 import BeautifulSoup, Tag

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --------------------------------------------------------------------------- #
#  Configuration                                                              #
# --------------------------------------------------------------------------- #

IMAGE_LINK_BLOCKLIST = {
    "placeholder.jpg", "dummy-image", "cookie-banner", "tracking-pixel",
    "contact", "svg", "gfx",
}

RAW_TEXT_STOPWORDS = {
    # navigation / UI
    "wróć udostępnij zapisz", "wróć", "zapisz", "udostępnij", "zdjęcia",
    "dodaj do ulubionych", "dodano do ulubionych", "skontaktuj się", "zadzwoń",
    # other junk
    "drukuj", "email", "e-mail", "reklama", "poleć", "obserwuj", "mapa",
    "pokaż na mapie", "więcej", "mniej", "zobacz więcej",
    "zgłoś błąd lub naruszenie", "wiadomość została", "wysłana",
    "wysyłanie wiadomości", "wystąpił błąd w trakcie wysyłania wiadomości",
    "wyślij kolejną wiadomość", "administratorem danych osobowych jest",
    "cele przetwarzania i twoje prawa", "powyższa oferta ma charakter",
    "charakter poglądowy", "oferta nie stanowi",
}

# ↓ catch the various header variants: „Opis…", „Opis nieruchomości…", etc.
DESCRIPTION_HDR_RE = re.compile(r"^\s*opis\b", re.I)

# status + availability date
AVAILABLE_KEYWORDS   = {"dostępny", "dostępna", "dostępne"}
UNAVAILABLE_KEYWORDS = {"wynajęte", "sprzedane", "zajęte", "niedostępne"}
RESERVED_KEYWORDS    = {"zarezerwowane", "rezerwacja"}
DATE_RE = re.compile(
    r"(?:od|od dnia)\s*(\d{4}-\d{2}-\d{2}|\d{2}[./-]\d{2}[./-]\d{4})",
    re.I,
)
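
# Illustrative examples (assumed phrasing, not from real listings):
#   "dostępne od 2024-05-01"  -> DATE_RE group(1) == "2024-05-01"
#   "od dnia 01.05.2024"      -> DATE_RE group(1) == "01.05.2024"
# Both variants are normalised to YYYY-MM-DD in `_detect_availability`.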

# --------------------------------------------------------------------------- #
#  Helpers                                                                    #
# --------------------------------------------------------------------------- #


def _clean_url(link: str) -> str:
    """Undo JSON quoting, stray quotes/backslashes and a trailing slash."""
    try:
        parsed = json.loads(link)
        # json.loads may yield a non-string (e.g. a number) – only accept str
        if isinstance(parsed, str):
            link = parsed
    except (ValueError, TypeError):
        pass
    return link.strip().strip('"').strip("'").replace("\\", "").rstrip("/")
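
# Illustrative example (assumed input): a JSON-quoted src attribute such as
# '"https:\/\/cdn.example.com\/img.jpg"' cleans up to
# 'https://cdn.example.com/img.jpg'.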


def _normalize_space(txt: str) -> str:
    # collapse every whitespace run (including tabs / newlines) to one space
    return re.sub(r"\s+", " ", txt.strip())


def _is_key(txt: str) -> bool:
    return bool(re.match(r"^[A-ZĄĆĘŁŃÓŚŹŻa-ząćęłńóśźż0-9\s\-/]{2,30}$", txt.strip()))


def _maybe_split_kv(cell: str) -> Tuple[str, str] | None:
    m = re.match(
        r"^(?P<k>[A-Za-zĄĆĘŁŃÓŚŹŻąćęłńóśźż][^:]{1,30})[\s:]+(?P<v>.+)$", cell
    )
    return (m.group("k"), m.group("v").strip()) if m and _is_key(m.group("k")) else None
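
# Illustrative example (assumed input): "Powierzchnia: 54 m²" splits into
# ("Powierzchnia", "54 m²"); lines whose left side fails `_is_key`
# (too long, punctuation, …) return None.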


def _unique(seq: List[str]) -> List[str]:
    seen, out = set(), []
    for x in seq:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out


# --------------------------------------------------------------------------- #
#  Main parser                                                                #
# --------------------------------------------------------------------------- #


@dataclass
class UniversalListingParser:
    html: str
    soup: BeautifulSoup = field(init=False)
    image_links: List[str] = field(init=False, default_factory=list)
    raw_lines: List[str] = field(init=False, default_factory=list)
    raw_sections: List[Dict[str, Sequence[str]]] = field(
        init=False, default_factory=list
    )
    key_value: Dict[str, str] = field(init=False, default_factory=dict)
    description: str | List[str] | None = field(init=False, default=None)
    status: str | None = field(init=False, default=None)  # available / unavailable / reserved
    available_from: str | None = field(init=False, default=None)

    # ------------------------- life-cycle ---------------------------------- #

    def __post_init__(self):
        self.soup = BeautifulSoup(self.html, "html.parser")
        self.image_links   = self._collect_images()
        self.raw_lines     = self._collect_lines()
        self._parse_structures()
        self._parse_free_text()
        self._fix_shifted_pairs()
        self._prune_garbage_keys()
        self._merge_sections()
        self._extract_description()
        self._detect_availability()

    # ------------------------- collectors ---------------------------------- #

    def _collect_images(self) -> List[str]:
        links = [
            _clean_url(img.get("src"))
            for img in self.soup.find_all("img")
            if img.get("src")
        ]
        return [
            l for l in links
            if not any(b in l.lower() for b in IMAGE_LINK_BLOCKLIST)
        ]

    def _collect_lines(self) -> List[str]:
        lines = self.soup.get_text("\n", strip=True).split("\n")
        return [
            _normalize_space(l)
            for l in lines
            if l and not any(sw in l.lower() for sw in RAW_TEXT_STOPWORDS)
        ]

    # ------------------------- structured parsers -------------------------- #

    def _parse_structures(self):
        for fn in (self._parse_tables, self._parse_dl, self._parse_divs):
            try:
                self.key_value.update(fn())
            except Exception as exc:
                logger.warning("%s failed: %s", fn.__name__, exc)

    def _parse_tables(self) -> Dict[str, str]:
        out, orphan = {}, None
        for table in self.soup.find_all("table"):
            for row in table.find_all("tr"):
                cells = [
                    c.get_text(strip=True)
                    for c in row.find_all(["td", "th"])
                    if c.get_text(strip=True)
                ]
                if not cells:
                    continue

                # classic <th>Key</th><td>Value</td> pair
                if len(cells) == 2 and _is_key(cells[0]):
                    out[cells[0]] = cells[1]
                    orphan = None
                    continue

                # <td>Key: Value</td>
                if len(cells) == 1:
                    cell = _normalize_space(cells[0])
                    merged = _maybe_split_kv(cell)
                    if merged:
                        out[merged[0]] = merged[1]
                        orphan = None
                        continue

                    if _is_key(cell):
                        orphan = cell
                        continue

                    if orphan:
                        out[orphan] = cell
                        orphan = None
                        continue

                # horizontal "table" in a single row (header + pairs)
                if len(cells) >= 3 and len(cells[1:]) % 2 == 0:
                    title, rest = cells[0], cells[1:]
                    for k, v in zip(rest[::2], rest[1::2]):
                        if _is_key(k):
                            out[f"{title} - {k}"] = v
        return out
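
    # Illustrative horizontal-row example (assumed markup): a single <tr> with
    # cells ["Media", "Internet", "tak", "Gaz", "brak"] yields
    # {"Media - Internet": "tak", "Media - Gaz": "brak"}.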

    def _parse_dl(self) -> Dict[str, str]:
        out = {}
        for dl in self.soup.find_all("dl"):
            for dt, dd in zip(dl.find_all("dt"), dl.find_all("dd")):
                k = _normalize_space(dt.get_text(" ", strip=True).rstrip(":"))
                v = _normalize_space(dd.get_text(" ", strip=True))
                if _is_key(k):
                    out[k] = v
        return out

    def _parse_divs(self) -> Dict[str, str]:
        out = {}
        for div in self.soup.find_all("div"):
            kids = [k for k in div.children if isinstance(k, Tag)]
            if len(kids) == 2:
                k = _normalize_space(kids[0].get_text(" ", strip=True).rstrip(":"))
                v = _normalize_space(kids[1].get_text(" ", strip=True))
                if _is_key(k):
                    out[k] = v
        return out
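
    # Illustrative pattern (assumed markup):
    # <div><span>Czynsz</span><span>2500 zł</span></div> -> {"Czynsz": "2500 zł"}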

    # ------------------------- free-text (ad hoc) -------------------------- #

    def _parse_free_text(self):
        """
        - identyfikujemy sekcje nagłówkami w stylu „XYZ:”
        - linie w sekcji „Opis …” **nie** są rozbijane na pary klucz:wartość
          (unikamy błędu z bullet-listą)
        """
        i, cur, title = 0, [], "Ogólne"
        in_desc = False

        while i < len(self.raw_lines):
            ln  = self.raw_lines[i]
            nxt = self.raw_lines[i + 1] if i + 1 < len(self.raw_lines) else ""
            low = ln.lower()

            # --- start / end of the "Opis …" (description) section ---------
            if DESCRIPTION_HDR_RE.match(low):
                if cur:
                    self.raw_sections.append({"section": title, "lines": cur})
                title, cur = "Opis", []        # nagłówek pomijamy
                in_desc    = True
                i += 1
                continue

            # --- new section outside the description -----------------------
            if ln.endswith(":") and _is_key(ln.rstrip(":")):
                if cur:
                    self.raw_sections.append({"section": title, "lines": cur})
                title, cur = ln.rstrip(":"), []
                in_desc    = False
                i += 1
                continue

            # --- extract KV pairs ONLY when NOT inside the description -----
            if not in_desc and ":" in ln and not ln.endswith(":"):
                k, v = map(_normalize_space, ln.split(":", 1))
                if _is_key(k):
                    self.key_value.setdefault(k, v)
                cur.append(ln)
                i += 1
                continue

            # --- Key / Value on consecutive lines --------------------------
            if (
                not in_desc
                and _is_key(ln)
                and nxt
                and not _is_key(nxt)
            ):
                self.key_value.setdefault(ln, nxt)
                cur.append(f"{ln}: {nxt}")
                i += 2
                continue

            # --- plain-text line --------------------------------------------
            cur.append(ln)
            i += 1

        if cur:
            self.raw_sections.append({"section": title, "lines": cur})

    # ------------------------- post-processing ----------------------------- #

    def _fix_shifted_pairs(self):
        """
        Przesunięcie o komórkę – np. pierwsza kolumna zawiera wartości,
        a druga klucze.  Detekcja: w value jest „sensowny klucz”.
        """
        shifted = {
            v: k
            for k, v in self.key_value.items()
            if _is_key(v) and re.search(r"\d", k)
        }
        for k in shifted:
            self.key_value.pop(shifted[k], None)
        self.key_value.update(shifted)
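        # Illustrative example (assumed input): a pair scraped the wrong way
        # round, {"54 m²": "Powierzchnia"}, is flipped to
        # {"Powierzchnia": "54 m²"} – "Powierzchnia" passes _is_key() while
        # the key "54 m²" contains a digit.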

    def _prune_garbage_keys(self):
        for k in list(self.key_value):
            v, low = self.key_value[k].strip(), k.lower()
            if (
                not v
                or v == k
                or (len(v) < 3 and not re.search(r"\d", v))
                or any(sw in low for sw in RAW_TEXT_STOPWORDS)
            ):
                self.key_value.pop(k)

    def _merge_sections(self):
        merged: dict[str, list[str]] = {}
        for s in self.raw_sections:
            merged.setdefault(s["section"], []).extend(s["lines"])
        self.raw_sections = [
            {"section": t, "lines": _unique(ls)} for t, ls in merged.items()
        ]

    # ------------------------- description (str or list) ------------------- #

    def _extract_description(self):
        """
        1) Spróbujmy najpierw elegancko: po nagłówku <h?>„Opis …” zbieramy
           <ul>/<ol> i zwracamy list[str].
        2) Jeśli nie ma listy → fallback do dotychczasowego scalania w str.
        """
        # -- approach 1: structured -----------------------------------------
        hdr = None
        for h in self.soup.find_all(re.compile(r"^h[1-6]$")):
            if DESCRIPTION_HDR_RE.match(_normalize_space(h.get_text(" ", strip=True))):
                hdr = h
                break

        if hdr:
            items: list[str] = []
            for sib in hdr.next_siblings:
                if isinstance(sib, Tag) and re.match(r"^h[1-6]$", sib.name):
                    break   # reached the next header
                if isinstance(sib, Tag) and sib.name in {"ul", "ol"}:
                    items.extend(
                        _normalize_space(li.get_text(" ", strip=True))
                        for li in sib.find_all("li")
                        if li.get_text(strip=True)
                    )

            if items:
                self.description = items
                return

        # -- approach 2: line-by-line text (as before) --------------------- #
        lines: list[str] = []
        for s in self.raw_sections:
            if s["section"].lower().startswith("opis"):
                lines.extend(
                    l for l in s["lines"]
                    if not l.lower().startswith(("rozwiń", "zobacz"))
                )
        self.description = _normalize_space(" ".join(lines)) or None

    # ------------------------- status / availability date ------------------ #

    def _detect_availability(self):
        joined = " ".join(self.raw_lines).lower()

        if any(w in joined for w in UNAVAILABLE_KEYWORDS):
            self.status = "unavailable"
        elif any(w in joined for w in RESERVED_KEYWORDS):
            self.status = "reserved"
        elif any(w in joined for w in AVAILABLE_KEYWORDS):
            self.status = "available"

        # "dostępne od …" (available from …)
        for ln in self.raw_lines:
            m = DATE_RE.search(ln.lower())
            if m:
                raw = m.group(1).replace("/", "-").replace(".", "-")
                parts = raw.split("-")
                self.available_from = (
                    raw if len(parts[0]) == 4 else f"{parts[2]}-{parts[1]}-{parts[0]}"
                )
                break

    # ------------------------- dunder ------------------------------------- #

    def __repr__(self) -> str:
        preview = ", ".join(
            itertools.islice((f"{k}={v}" for k, v in self.key_value.items()), 5)
        )
        return (
            f"<UniversalListingParser {len(self.key_value)} KV ({preview}), "
            f"{len(self.image_links)} imgs | "
            f"desc={'list' if isinstance(self.description, list) else 'str'}>"
        )


# --------------------------------------------------------------------------- #
#  Pipeline wrapper                                                           #
# --------------------------------------------------------------------------- #


def raw_data_cleaner(html: str) -> dict:
    """
    Owijka pod użycie w Playwright-owym pipeline-ie.

    * description może być albo stringiem, albo listą łańcuchów
      – zależnie od tego, czy w HTML-u użyto <ul>/<li>.
    """
    parser = UniversalListingParser(html)

    flat_txt = "\n".join(
        ln for sec in parser.raw_sections for ln in sec["lines"]
    )

    parse_data = {
        "description":    parser.description,
        "status":         parser.status,
        "available_from": parser.available_from,
        **parser.key_value,
    }
    # drop the Nones
    parse_data = {k: v for k, v in parse_data.items() if v is not None}

    return {
        "raw_text":    flat_txt,
        "image_links": parser.image_links,
        "parse_data":  parse_data,
    }
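

# --------------------------------------------------------------------------- #
#  Smoke test                                                                 #
# --------------------------------------------------------------------------- #

# A minimal usage sketch (made-up HTML and URL, not a real listing) showing
# the shape of the returned dict – run the module directly to try it.
if __name__ == "__main__":
    from pprint import pprint

    _SAMPLE_HTML = """
    <html><body>
      <h2>Opis nieruchomości</h2>
      <ul><li>2 pokoje</li><li>balkon</li></ul>
      <table><tr><th>Czynsz</th><td>2500 zł</td></tr></table>
      <p>Mieszkanie dostępne od 2024-05-01</p>
      <img src="https://cdn.example.com/flat.jpg">
    </body></html>
    """
    pprint(raw_data_cleaner(_SAMPLE_HTML))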
