"""
dynamic_parser.py
=================
Uniwersalny parser ogłoszeń HTML + wrapper `raw_data_cleaner()`.

Zwraca słownik:
    raw_text      – złączony tekst wszystkich sekcji
    image_links   – lista src grafik
    parse_data    – {
        description    … str  albo list[str]  (patrz sekcja „Opis”)
        status         … available / unavailable / reserved / None
        available_from … "YYYY-MM-DD" lub None
        <key:value>    … reszta cech
      }
"""

from __future__ import annotations

import itertools
import json
import logging
import re
from dataclasses import dataclass, field
from typing import Dict, List, Sequence, Tuple

from bs4 import BeautifulSoup, Tag

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --------------------------------------------------------------------------- #
#  Configuration                                                              #
# --------------------------------------------------------------------------- #

IMAGE_LINK_BLOCKLIST = {
    "placeholder.jpg", "dummy-image", "cookie-banner", "tracking-pixel",
    "contact", "svg", "gfx",
}

RAW_TEXT_STOPWORDS = {
    # navigation / UI
    "wróć udostępnij zapisz", "wróć", "zapisz", "udostępnij", "zdjęcia",
    "dodaj do ulubionych", "dodano do ulubionych", "skontaktuj się", "zadzwoń",
    # other junk
    "drukuj", "email", "e-mail", "reklama", "poleć", "obserwuj", "mapa",
    "pokaż na mapie", "więcej", "mniej", "zobacz więcej",
    "zgłoś błąd lub naruszenie", "wiadomość została", "wysłana",
    "wysyłanie wiadomości", "wystąpił błąd w trakcie wysyłania wiadomości",
    "wyślij kolejną wiadomość", "administratorem danych osobowych jest",
    "cele przetwarzania i twoje prawa", "powyższa oferta ma charakter",
    "charakter poglądowy", "oferta nie stanowi",
}

# ↓ catch the various header variants: „Opis…", „Opis nieruchomości…", etc.
DESCRIPTION_HDR_RE = re.compile(r"^\s*opis\b", re.I)

# status + availability date
AVAILABLE_KEYWORDS   = {"dostępny", "dostępna", "dostępne"}
UNAVAILABLE_KEYWORDS = {"wynajęte", "sprzedane", "zajęte", "niedostępne"}
RESERVED_KEYWORDS    = {"zarezerwowane", "rezerwacja"}
DATE_RE = re.compile(
    r"(?:od|od dnia)\s*(\d{4}-\d{2}-\d{2}|\d{2}[./-]\d{2}[./-]\d{4})",
    re.I,
)
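
# Illustrative examples (assumed phrasing, not from real listings):
#   "dostępne od 2024-05-01"  -> DATE_RE group(1) == "2024-05-01"
#   "od dnia 01.05.2024"      -> DATE_RE group(1) == "01.05.2024"
# Both variants are normalised to YYYY-MM-DD in `_detect_availability`.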

# --------------------------------------------------------------------------- #
#  Helpers                                                                    #
# --------------------------------------------------------------------------- #


def _clean_url(link: str) -> str:
    """Undo JSON quoting, stray quotes/backslashes and a trailing slash."""
    try:
        parsed = json.loads(link)
        # json.loads may yield a non-string (e.g. a number) – only accept str
        if isinstance(parsed, str):
            link = parsed
    except (ValueError, TypeError):
        pass
    return link.strip().strip('"').strip("'").replace("\\", "").rstrip("/")
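
# Illustrative example (assumed input): a JSON-quoted src attribute such as
# '"https:\/\/cdn.example.com\/img.jpg"' cleans up to
# 'https://cdn.example.com/img.jpg'.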


def _normalize_space(txt: str) -> str:
    # collapse every whitespace run (including tabs / newlines) to one space
    return re.sub(r"\s+", " ", txt.strip())


def _is_key(txt: str) -> bool:
    return bool(re.match(r"^[A-ZĄĆĘŁŃÓŚŹŻa-ząćęłńóśźż0-9\s\-/]{2,30}$", txt.strip()))


def _maybe_split_kv(cell: str) -> Tuple[str, str] | None:
    m = re.match(
        r"^(?P<k>[A-Za-zĄĆĘŁŃÓŚŹŻąćęłńóśźż][^:]{1,30})[\s:]+(?P<v>.+)$", cell
    )
    return (m.group("k"), m.group("v").strip()) if m and _is_key(m.group("k")) else None
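
# Illustrative example (assumed input): "Powierzchnia: 54 m²" splits into
# ("Powierzchnia", "54 m²"); lines whose left side fails `_is_key`
# (too long, punctuation, …) return None.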


def _unique(seq: List[str]) -> List[str]:
    seen, out = set(), []
    for x in seq:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out


# --------------------------------------------------------------------------- #
#  Main parser                                                                #
# --------------------------------------------------------------------------- #


@dataclass
class UniversalListingParser:
    html: str
    soup: BeautifulSoup = field(init=False)
    image_links: List[str] = field(init=False, default_factory=list)
    raw_lines: List[str] = field(init=False, default_factory=list)
    raw_sections: List[Dict[str, Sequence[str]]] = field(
        init=False, default_factory=list
    )
    key_value: Dict[str, str] = field(init=False, default_factory=dict)
    description: str | List[str] | None = field(init=False, default=None)
    status: str | None = field(init=False, default=None)  # available / unavailable / reserved
    available_from: str | None = field(init=False, default=None)

    # ------------------------- life-cycle ---------------------------------- #

    def __post_init__(self):
        self.soup = BeautifulSoup(self.html, "html.parser")
        self.image_links   = self._collect_images()
        self.raw_lines     = self._collect_lines()
        self._parse_structures()
        self._parse_free_text()
        self._fix_shifted_pairs()
        self._prune_garbage_keys()
        self._merge_sections()
        self._extract_description()
        self._detect_availability()

    # ------------------------- collectors ---------------------------------- #

    def _collect_images(self) -> List[str]:
        links = [
            _clean_url(img.get("src"))
            for img in self.soup.find_all("img")
            if img.get("src")
        ]
        return [
            l for l in links
            if not any(b in l.lower() for b in IMAGE_LINK_BLOCKLIST)
        ]

    def _collect_lines(self) -> List[str]:
        lines = self.soup.get_text("\n", strip=True).split("\n")
        return [
            _normalize_space(l)
            for l in lines
            if l and not any(sw in l.lower() for sw in RAW_TEXT_STOPWORDS)
        ]

    # ------------------------- structured parsers -------------------------- #

    def _parse_structures(self):
        for fn in (self._parse_tables, self._parse_dl, self._parse_divs):
            try:
                self.key_value.update(fn())
            except Exception as exc:
                logger.warning("%s failed: %s", fn.__name__, exc)

    def _parse_tables(self) -> Dict[str, str]:
        out, orphan = {}, None
        for table in self.soup.find_all("table"):
            for row in table.find_all("tr"):
                cells = [
                    c.get_text(strip=True)
                    for c in row.find_all(["td", "th"])
                    if c.get_text(strip=True)
                ]
                if not cells:
                    continue

                # classic <th>Key</th><td>Value</td> pair
                if len(cells) == 2 and _is_key(cells[0]):
                    out[cells[0]] = cells[1]
                    orphan = None
                    continue

                # <td>Key: Value</td>
                if len(cells) == 1:
                    cell = _normalize_space(cells[0])
                    merged = _maybe_split_kv(cell)
                    if merged:
                        out[merged[0]] = merged[1]
                        orphan = None
                        continue

                    if _is_key(cell):
                        orphan = cell
                        continue

                    if orphan:
                        out[orphan] = cell
                        orphan = None
                        continue

                # horizontal "table" in a single row (header + pairs)
                if len(cells) >= 3 and len(cells[1:]) % 2 == 0:
                    title, rest = cells[0], cells[1:]
                    for k, v in zip(rest[::2], rest[1::2]):
                        if _is_key(k):
                            out[f"{title} - {k}"] = v
        return out
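
    # Illustrative horizontal-row example (assumed markup): a single <tr> with
    # cells ["Media", "Internet", "tak", "Gaz", "brak"] yields
    # {"Media - Internet": "tak", "Media - Gaz": "brak"}.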

    def _parse_dl(self) -> Dict[str, str]:
        out = {}
        for dl in self.soup.find_all("dl"):
            for dt, dd in zip(dl.find_all("dt"), dl.find_all("dd")):
                k = _normalize_space(dt.get_text(" ", strip=True).rstrip(":"))
                v = _normalize_space(dd.get_text(" ", strip=True))
                if _is_key(k):
                    out[k] = v
        return out

    def _parse_divs(self) -> Dict[str, str]:
        out = {}
        for div in self.soup.find_all("div"):
            kids = [k for k in div.children if isinstance(k, Tag)]
            if len(kids) == 2:
                k = _normalize_space(kids[0].get_text(" ", strip=True).rstrip(":"))
                v = _normalize_space(kids[1].get_text(" ", strip=True))
                if _is_key(k):
                    out[k] = v
        return out
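
    # Illustrative pattern (assumed markup):
    # <div><span>Czynsz</span><span>2500 zł</span></div> -> {"Czynsz": "2500 zł"}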

    # ------------------------- free-text (ad hoc) -------------------------- #

    def _parse_free_text(self):
        """
        - identyfikujemy sekcje nagłówkami w stylu „XYZ:”
        - linie w sekcji „Opis …” **nie** są rozbijane na pary klucz:wartość
          (unikamy błędu z bullet-listą)
        """
        i, cur, title = 0, [], "Ogólne"
        in_desc = False

        while i < len(self.raw_lines):
            ln  = self.raw_lines[i]
            nxt = self.raw_lines[i + 1] if i + 1 < len(self.raw_lines) else ""
            low = ln.lower()

            # --- start / end of the "Opis …" (description) section ---------
            if DESCRIPTION_HDR_RE.match(low):
                if cur:
                    self.raw_sections.append({"section": title, "lines": cur})
                title, cur = "Opis", []        # nagłówek pomijamy
                in_desc    = True
                i += 1
                continue

            # --- new section outside the description -----------------------
            if ln.endswith(":") and _is_key(ln.rstrip(":")):
                if cur:
                    self.raw_sections.append({"section": title, "lines": cur})
                title, cur = ln.rstrip(":"), []
                in_desc    = False
                i += 1
                continue

            # --- extract KV pairs ONLY when NOT inside the description -----
            if not in_desc and ":" in ln and not ln.endswith(":"):
                k, v = map(_normalize_space, ln.split(":", 1))
                if _is_key(k):
                    self.key_value.setdefault(k, v)
                cur.append(ln)
                i += 1
                continue

            # --- Key / Value on consecutive lines --------------------------
            if (
                not in_desc
                and _is_key(ln)
                and nxt
                and not _is_key(nxt)
            ):
                self.key_value.setdefault(ln, nxt)
                cur.append(f"{ln}: {nxt}")
                i += 2
                continue

            # --- plain-text line --------------------------------------------
            cur.append(ln)
            i += 1

        if cur:
            self.raw_sections.append({"section": title, "lines": cur})

    # ------------------------- post-processing ----------------------------- #

    def _fix_shifted_pairs(self):
        """
        Przesunięcie o komórkę – np. pierwsza kolumna zawiera wartości,
        a druga klucze.  Detekcja: w value jest „sensowny klucz”.
        """
        shifted = {
            v: k
            for k, v in self.key_value.items()
            if _is_key(v) and re.search(r"\d", k)
        }
        for k in shifted:
            self.key_value.pop(shifted[k], None)
        self.key_value.update(shifted)
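        # Illustrative example (assumed input): a pair scraped the wrong way
        # round, {"54 m²": "Powierzchnia"}, is flipped to
        # {"Powierzchnia": "54 m²"} – "Powierzchnia" passes _is_key() while
        # the key "54 m²" contains a digit.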

    def _prune_garbage_keys(self):
        for k in list(self.key_value):
            v, low = self.key_value[k].strip(), k.lower()
            if (
                not v
                or v == k
                or (len(v) < 3 and not re.search(r"\d", v))
                or any(sw in low for sw in RAW_TEXT_STOPWORDS)
            ):
                self.key_value.pop(k)

    def _merge_sections(self):
        merged: dict[str, list[str]] = {}
        for s in self.raw_sections:
            merged.setdefault(s["section"], []).extend(s["lines"])
        self.raw_sections = [
            {"section": t, "lines": _unique(ls)} for t, ls in merged.items()
        ]

    # ------------------------- description (str or list) ------------------- #

    def _extract_description(self):
        """
        1) Spróbujmy najpierw elegancko: po nagłówku <h?>„Opis …” zbieramy
           <ul>/<ol> i zwracamy list[str].
        2) Jeśli nie ma listy → fallback do dotychczasowego scalania w str.
        """
        # -- approach 1: structured -----------------------------------------
        hdr = None
        for h in self.soup.find_all(re.compile(r"^h[1-6]$")):
            if DESCRIPTION_HDR_RE.match(_normalize_space(h.get_text(" ", strip=True))):
                hdr = h
                break

        if hdr:
            items: list[str] = []
            for sib in hdr.next_siblings:
                if isinstance(sib, Tag) and re.match(r"^h[1-6]$", sib.name):
                    break   # reached the next header
                if isinstance(sib, Tag) and sib.name in {"ul", "ol"}:
                    items.extend(
                        _normalize_space(li.get_text(" ", strip=True))
                        for li in sib.find_all("li")
                        if li.get_text(strip=True)
                    )

            if items:
                self.description = items
                return

        # -- approach 2: line-by-line text (as before) --------------------- #
        lines: list[str] = []
        for s in self.raw_sections:
            if s["section"].lower().startswith("opis"):
                lines.extend(
                    l for l in s["lines"]
                    if not l.lower().startswith(("rozwiń", "zobacz"))
                )
        self.description = _normalize_space(" ".join(lines)) or None

    # ------------------------- status / availability date ------------------ #

    def _detect_availability(self):
        joined = " ".join(self.raw_lines).lower()

        if any(w in joined for w in UNAVAILABLE_KEYWORDS):
            self.status = "unavailable"
        elif any(w in joined for w in RESERVED_KEYWORDS):
            self.status = "reserved"
        elif any(w in joined for w in AVAILABLE_KEYWORDS):
            self.status = "available"

        # "dostępne od …" (available from …)
        for ln in self.raw_lines:
            m = DATE_RE.search(ln.lower())
            if m:
                raw = m.group(1).replace("/", "-").replace(".", "-")
                parts = raw.split("-")
                self.available_from = (
                    raw if len(parts[0]) == 4 else f"{parts[2]}-{parts[1]}-{parts[0]}"
                )
                break

    # ------------------------- dunder ------------------------------------- #

    def __repr__(self) -> str:
        preview = ", ".join(
            itertools.islice((f"{k}={v}" for k, v in self.key_value.items()), 5)
        )
        return (
            f"<UniversalListingParser {len(self.key_value)} KV ({preview}), "
            f"{len(self.image_links)} imgs | "
            f"desc={'list' if isinstance(self.description, list) else 'str'}>"
        )


# --------------------------------------------------------------------------- #
#  Pipeline wrapper                                                           #
# --------------------------------------------------------------------------- #


def raw_data_cleaner(html: str) -> dict:
    """
    Owijka pod użycie w Playwright-owym pipeline-ie.

    * description może być albo stringiem, albo listą łańcuchów
      – zależnie od tego, czy w HTML-u użyto <ul>/<li>.
    """
    parser = UniversalListingParser(html)

    flat_txt = "\n".join(
        ln for sec in parser.raw_sections for ln in sec["lines"]
    )

    parse_data = {
        "description":    parser.description,
        "status":         parser.status,
        "available_from": parser.available_from,
        **parser.key_value,
    }
    # drop the Nones
    parse_data = {k: v for k, v in parse_data.items() if v is not None}

    return {
        "raw_text":    flat_txt,
        "image_links": parser.image_links,
        "parse_data":  parse_data,
    }
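

# --------------------------------------------------------------------------- #
#  Smoke test                                                                 #
# --------------------------------------------------------------------------- #

# A minimal usage sketch (made-up HTML and URL, not a real listing) showing
# the shape of the returned dict – run the module directly to try it.
if __name__ == "__main__":
    from pprint import pprint

    _SAMPLE_HTML = """
    <html><body>
      <h2>Opis nieruchomości</h2>
      <ul><li>2 pokoje</li><li>balkon</li></ul>
      <table><tr><th>Czynsz</th><td>2500 zł</td></tr></table>
      <p>Mieszkanie dostępne od 2024-05-01</p>
      <img src="https://cdn.example.com/flat.jpg">
    </body></html>
    """
    pprint(raw_data_cleaner(_SAMPLE_HTML))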
