# manual_agregator/parser/utils.py

import json
import re
from collections import OrderedDict
from datetime import date, datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag

from extractly.models import NetworkMonitoredPage, AdsManual

def resolve_missing_bool(config):
    opt = config.get("ifMissing", "null")
    if opt == "true": return True
    if opt == "false": return False
    return None

def resolve_missing_text(config):
    if config.get("ifMissing") == "default":
        return config.get("defaultValue", "")
    if config.get("ifMissing") == "null":
        return None
    return None
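
# Usage sketch (hypothetical "ifMissing" configs):
#
#   resolve_missing_bool({"ifMissing": "true"})   # -> True
#   resolve_missing_bool({})                      # -> None
#   resolve_missing_text({"ifMissing": "default", "defaultValue": "n/a"})  # -> "n/a"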


def parse_currency(label_or_text):
    if not label_or_text:
        return None
    text = str(label_or_text).strip().lower().replace(" ", "")
    CURRENCY_MAP = {
        "zł":"PLN","pln":"PLN","zl":"PLN","zł.":"PLN",
        "eur":"EUR","euro":"EUR","€":"EUR",
        "usd":"USD","dolar":"USD","$":"USD","us$":"USD",
        "gbp":"GBP","funt":"GBP","£":"GBP",
    }
    for k, v in CURRENCY_MAP.items():
        if k in text:
            return v
    return None
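
# Usage sketch: matching is by substring on the lowercased, space-stripped text,
# so unit suffixes are tolerated:
#
#   parse_currency("1 200 zł")  # -> "PLN"
#   parse_currency("450 €/mc")  # -> "EUR"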

def extract_number_and_label(text):
    if not text:
        return None, None, None
    t = str(text).replace('\xa0', ' ').replace(',', '.')
    m = re.search(r'([\d\s]+(?:\.\d+)?)', t)
    number = label = currency = None
    if m:
        number_str = m.group(1).replace(" ", "")
        try:
            number = float(number_str) if "." in number_str else int(number_str)
        except Exception:
            number = None
        label = t[m.end():].strip()
        currency = parse_currency(label) or parse_currency(t)
    return number, label, currency
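
# Usage sketch (Polish-formatted input; the comma is treated as a decimal separator):
#
#   extract_number_and_label("1 250,50 zł/m²")  # -> (1250.5, "zł/m²", "PLN")
#   extract_number_and_label("42 m²")           # -> (42, "m²", None)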


_CURRENCY_HINTS = ("zł", "pln", "€", "eur", "$", "usd", "£", "gbp")

def _norm_label(s: str) -> str:
    if not s:
        return ""
    s = s.replace("\xa0", " ")
    s = re.sub(r"\s+", " ", s)
    s = s.strip()
    if s.endswith(":"):
        s = s[:-1].strip()
    return s.lower()

def extract_value_by_label(soup: BeautifulSoup, config: dict):
    """
    Backwards-compatible label→value extractor.
    Handles:
      - OtoDom: <div data-sentry-element="ItemGridContainer"><p>LABEL</p><p>VALUE</p></div>
      - fallbacks: 'sibling'/'parent'/'next_tag' + valueTags
    Config keys (unchanged from before; all optional except 'label'):
      - label: str
      - selector: str (narrows the search scope)
      - altLabels: list[str]
      - caseInsensitive: bool (default True)
      - match: "equals" | "contains" (default "equals")
      - valueTags: list[str] (default ["p","span","div"])
      - valueClasses: list[str] (restrict value elements to these CSS classes)
      - labelPosition: "sibling" | "parent" | "next_tag" (default "sibling")
      - parentFallback: bool (default True)
    """
    if not soup or not config or not config.get("label"):
        return None

    label            = config.get("label")
    alt_labels       = config.get("altLabels", []) or []
    case_insensitive = config.get("caseInsensitive", True)
    match_type       = config.get("match", "equals")
    value_tags       = tuple(config.get("valueTags") or ("p", "span", "div"))
    value_classes    = {c for c in config.get("valueClasses", []) if c}
    strategy         = config.get("labelPosition", "sibling")
    root_selector    = config.get("selector")
    allow_parent_fallback = config.get("parentFallback", True)

    def _norm(s: str) -> str:
        # _norm_label lowercases; redo its cleanup without lowering in case-sensitive mode
        if case_insensitive:
            return _norm_label(s)
        s = re.sub(r"\s+", " ", (s or "").replace("\xa0", " ")).strip()
        return s[:-1].strip() if s.endswith(":") else s

    wanted = {_norm(label)} | {_norm(x) for x in alt_labels}

    def is_match(text: str) -> bool:
        t = _norm(text)
        if match_type == "contains":
            return any(w in t for w in wanted)
        return t in wanted

    roots = soup.select(root_selector) if root_selector else [soup]

    # --- 1) OtoDom mode: rows rendered as ItemGridContainer (or with class css-1xw0jqp)
    for root in roots:
        rows = root.select('div[data-sentry-element="ItemGridContainer"], div.css-1xw0jqp')
        for row in rows:
            ps = row.find_all("p", recursive=True)
            if len(ps) < 2:
                continue
            lab = ps[0].get_text(" ", strip=True)
            if not is_match(lab):
                continue
            val_text = ps[1].get_text(" ", strip=True)
            if val_text:
                return val_text

    # --- 2) Fallback: the pre-existing lookup strategies
    def _good_value(text: str) -> bool:
        t = (text or "").strip()
        return bool(re.search(r"\d", t)) or any(h in t.lower() for h in _CURRENCY_HINTS)

    # Find every potential label node within the selected scope
    label_nodes = []
    for root in roots:
        for el in root.find_all(True):
            txt = el.get_text(" ", strip=True)
            if txt and is_match(txt):
                label_nodes.append(el)

    if not label_nodes:
        return None

    def _matches_value_class(tag: Tag) -> bool:
        if not value_classes:
            return True
        classes = tag.get("class")
        if not classes:
            return False
        if isinstance(classes, str):
            classes = classes.split()
        return any(cls in classes for cls in value_classes)

    for el in label_nodes:
        # strategy: sibling (also the first step of the 'parent' strategy)
        if strategy in ("sibling", "parent"):
            sib = el.find_next_sibling()
            while isinstance(sib, Tag):
                if sib.name in value_tags and _matches_value_class(sib):
                    val = sib.get_text(" ", strip=True)
                    if val:
                        return val
                    # icon-only boolean support: detect images with yes/no icons
                    img = sib.find("img")
                    src = (img.get("src") or "").lower() if img else ""
                    if "ikona_tak" in src or "icon_yes" in src or "yes" in src:
                        return "tak"
                    if "ikona_nie" in src or "icon_no" in src or "no" in src:
                        return "nie"
                sib = sib.find_next_sibling()

        # strategy: next_tag (checked before the parent fallback, which would otherwise shadow it)
        if strategy == "next_tag":
            nxt = el.find_next()
            if isinstance(nxt, Tag) and nxt.name in value_tags:
                val = nxt.get_text(" ", strip=True)
                if val:
                    return val

        # parent fallback (walk up at most 3 levels and pick the best candidate)
        if allow_parent_fallback:
            up = el
            for _ in range(3):
                up = up.parent if isinstance(up, Tag) else None
                if not isinstance(up, Tag):
                    break
                candidates = []
                for tag_name in value_tags:
                    for cand in up.find_all(tag_name, recursive=True):
                        if not _matches_value_class(cand):
                            continue
                        if cand is el:
                            continue
                        txt = cand.get_text(" ", strip=True)
                        if not txt:
                            continue
                        score = 2 if _good_value(txt) else 1
                        # prefer candidates that appear after the label (when sourceline is available)
                        c_line = getattr(cand, "sourceline", 0) or 0
                        e_line = getattr(el, "sourceline", 0) or 0
                        if c_line >= e_line and (c_line or e_line):
                            score += 1
                        candidates.append((score, txt))
                if candidates:
                    candidates.sort(key=lambda x: x[0], reverse=True)
                    return candidates[0][1]

        # Final fallback for icon-only values: scan up to 3 following sibling cells for an img
        value_cell = el.find_next_sibling()
        hops = 0
        while isinstance(value_cell, Tag) and hops < 3:
            img = value_cell.find("img")
            src = (img.get("src") or "").lower() if img else ""
            if "ikona_tak" in src or "icon_yes" in src or "yes" in src:
                return "tak"
            if "ikona_nie" in src or "icon_no" in src or "no" in src:
                return "nie"
            value_cell = value_cell.find_next_sibling()
            hops += 1

    return None
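
# Usage sketch (illustrative HTML fragment, not a real listing page):
#
#   soup = BeautifulSoup(
#       '<div data-sentry-element="ItemGridContainer">'
#       '<p>Czynsz</p><p>650 zł</p></div>', "html.parser")
#   extract_value_by_label(soup, {"label": "Czynsz"})  # -> "650 zł"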

# --------- image link extraction ---------

def _normalize_url(u: str) -> str:
    if not u:
        return ""
    u = u.strip()
    # protocol-relative -> https
    if u.startswith("//"):
        return "https:" + u
    return u

def _split_srcset(srcset: str):
    """
    Parses a srcset attribute and returns the list of URLs (without the w/h/x descriptors).
    """
    urls = []
    if not srcset:
        return urls
    for part in srcset.split(","):
        cand = part.strip().split(" ")[0].strip()
        if cand:
            urls.append(cand)
    return urls

def extract_image_links(soup, selector=None, base_url: str = None):
    """
    Returns absolute image URLs, de-duplicated, preserving document order.
    Supports <img src>, lazy-loading attributes, <source srcset>, and meta og:image/twitter:image.
    When `selector` is given, only the matching DOM fragment(s) are searched.
    """
    seen = OrderedDict()
    scopes = soup.select(selector) if selector else [soup]

    def add(url: str):
        url = _normalize_url(url)
        if not url:
            return
        # http(s) only
        if not (url.startswith("http://") or url.startswith("https://")):
            # resolve relative paths against base_url when available; skip data: URIs
            if base_url and not url.startswith("data:"):
                url = urljoin(base_url, url)
                if not (url.startswith("http://") or url.startswith("https://")):
                    return
            else:
                return
        if url not in seen:
            seen[url] = True

    for scope in scopes:
        # 1) classic <img>
        for img in scope.find_all("img"):
            # plain src
            add(img.get("src"))
            # lazy-loading attributes
            add(img.get("data-src"))
            add(img.get("data-original"))
            add(img.get("data-lazy"))
            # some CDNs use data-srcset
            srcset = img.get("srcset") or img.get("data-srcset")
            if srcset:
                for u in _split_srcset(srcset):
                    add(u)

        # 2) <source srcset> inside <picture>
        for src in scope.find_all("source"):
            srcset = src.get("srcset") or src.get("data-srcset")
            if srcset:
                for u in _split_srcset(srcset):
                    add(u)

        # 3) <noscript> with an <img> inside
        for ns in scope.find_all("noscript"):
            # noscript often carries raw <img ...> HTML; re-parse its contents
            inner = BeautifulSoup(ns.decode_contents(), "html.parser")
            for img in inner.find_all("img"):
                add(img.get("src"))
                add(img.get("data-src"))
                srcset = img.get("srcset") or img.get("data-srcset")
                if srcset:
                    for u in _split_srcset(srcset):
                        add(u)

    # 4) meta og:image / twitter:image (looked up globally, outside the scopes).
    #    Appended last, as a fallback.
    og = soup.find("meta", attrs={"property": "og:image"})
    if og and og.get("content"):
        add(og["content"])
    tw = soup.find("meta", attrs={"name": "twitter:image"})
    if tw and tw.get("content"):
        add(tw["content"])

    return list(seen.keys())
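
# Usage sketch (illustrative markup; base_url resolves the relative path):
#
#   soup = BeautifulSoup('<img src="/media/a.jpg">'
#                        '<img data-src="https://cdn.example.com/b.jpg">', "html.parser")
#   extract_image_links(soup, base_url="https://example.com")
#   # -> ["https://example.com/media/a.jpg", "https://cdn.example.com/b.jpg"]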


def value_is_empty(v):
    if v is None: return True
    if isinstance(v, str) and v.strip() == "": return True
    if isinstance(v, (list, dict)) and not v: return True
    return False

def check_inactive(page, soup, inactive_rules: list) -> bool:
    for rule in inactive_rules or []:
        t = rule.get("type")
        if t == "text_contains":
            txt = (rule.get("text") or "").lower()
            if txt and txt in (page.html or "").lower():
                return True
        elif t == "selector_text":
            sel = rule.get("selector")
            expected = (rule.get("text") or "").lower()
            el = soup.select_one(sel) if sel else None
            if el and el.get_text(strip=True).lower() == expected:
                return True
        elif t == "selector_contains":
            sel = rule.get("selector")
            txt = (rule.get("text") or "").lower()
            el = soup.select_one(sel) if sel else None
            if el and txt in el.get_text(" ", strip=True).lower():
                return True
        elif t == "selector_missing":
            sel = rule.get("selector")
            if sel and not soup.select_one(sel):
                return True
        elif t == "source_field_match":
            field = rule.get("field")
            match = (rule.get("match") or "").lower()
            value = getattr(page, field, "") if field else ""
            if isinstance(value, dict):
                value = json.dumps(value)
            if value and match in str(value).lower():
                return True
    return False
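
# Usage sketch (hypothetical rules; `page.html` holds the raw page source):
#
#   rules = [
#       {"type": "text_contains", "text": "ogłoszenie nieaktualne"},
#       {"type": "selector_missing", "selector": "div.offer-price"},
#   ]
#   check_inactive(page, soup, rules)  # -> True if any rule matches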

def is_field_config_dict(d: dict) -> bool:
    try:
        return any(
            isinstance(v, dict) and ('fieldType' in v or 'selectors' in v or 'fromMain' in v)
            for v in d.values()
        )
    except Exception:
        return False

# --------- normalization helpers ---------

def normalize_rules(rules):
    """
    Returns a list of rules (list[dict]) or an empty list.
    Accepts: None, str (JSON), dict, list/tuple.
    Any other type -> [].
    """
    if not rules:
        return []
    try:
        if isinstance(rules, str):
            rules = json.loads(rules)
    except Exception:
        return []
    if isinstance(rules, dict):
        return [rules]
    if isinstance(rules, (list, tuple)):
        return list(rules)
    return []

def normalize_selectors(selectors):
    """
    Returns a dict of selectors (either branches or a flat field map).
    Accepts: None -> {}, str (JSON) -> dict or {},
             dict -> dict, anything else -> {}.
    """
    if not selectors:
        return {}
    if isinstance(selectors, str):
        try:
            sel = json.loads(selectors)
            return sel if isinstance(sel, dict) else {}
        except Exception:
            return {}
    return selectors if isinstance(selectors, dict) else {}
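
# Usage sketch:
#
#   normalize_rules('[{"selector": ".badge", "match": "sprzedane", "type": "sold"}]')
#   # -> [{"selector": ".badge", "match": "sprzedane", "type": "sold"}]
#   normalize_selectors(None)        # -> {}
#   normalize_selectors("not json")  # -> {}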

# --------- branch selection ---------

def resolve_selectors(manual_config, soup):
    """
    ALWAYS returns a tuple: (selectors_dict, selected_type|None).
    Handles:
      - legacy-style rules: {'selector','match','type'} / {'source','match','type'}
      - a 'default' branch
      - a flat field map
      - a single branch
    """
    selected_type = None
    rules = normalize_rules(getattr(manual_config, "rules", None))
    all_sel = normalize_selectors(getattr(manual_config, "selectors", None))

    # legacy rule style (selector + match => type)
    for rule in rules:
        if not isinstance(rule, dict):
            continue
        if "selector" in rule and "type" in rule:
            sel = rule.get("selector")
            el = soup.select_one(sel) if sel else None
            if el:
                content = el.get_text(strip=True).lower()
                mv = rule.get("match", [])
                if isinstance(mv, str):
                    mv = [mv]
                if any(v.lower() in content for v in mv):
                    selected_type = rule.get("type")
                    break
        elif "source" in rule and "type" in rule:
            # kept for backwards compatibility
            pass

    if selected_type and selected_type in all_sel and isinstance(all_sel[selected_type], dict):
        return all_sel[selected_type], selected_type

    if "default" in all_sel and isinstance(all_sel["default"], dict):
        return all_sel["default"], "default"

    if is_field_config_dict(all_sel):
        return all_sel, "flat"

    if len(all_sel) == 1:
        k, v = next(iter(all_sel.items()))
        if isinstance(v, dict):
            return v, k

    # no usable selectors
    return {}, None
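
# Usage sketch (hypothetical config object with a 'default' branch, given a parsed `soup`):
#
#   class Cfg:
#       rules = None
#       selectors = {"default": {"price": {"fieldType": "text"}}}
#   resolve_selectors(Cfg(), soup)  # -> ({"price": {"fieldType": "text"}}, "default")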

# --------- dynamic rules (applied after the first parse) ---------

def apply_dynamic_rules(rules, soup, extracted):
    """
    New style:
      { "when": { "all": [...], "any": [...] }, "type": "otodom_v2" }
    Returns the branch name (type) or None.
    """
    def cond_ok(cond: dict) -> bool:
        if "field_empty" in cond:
            return value_is_empty(extracted.get(cond["field_empty"]))
        if "field_missing" in cond:
            return extracted.get(cond["field_missing"]) is None
        if "selector_exists" in cond:
            sel = cond.get("selector_exists")
            return bool(sel and soup.select_one(sel))
        if "selector_missing" in cond:
            sel = cond.get("selector_missing")
            return bool(sel) and (soup.select_one(sel) is None)
        if "selector_contains" in cond:
            sel = cond["selector_contains"].get("selector")
            txt = (cond["selector_contains"].get("text") or "").lower()
            el = soup.select_one(sel) if sel else None
            return bool(el and txt in el.get_text(" ", strip=True).lower())
        return False

    rules_list = normalize_rules(rules)
    for rule in rules_list:
        if not isinstance(rule, dict):
            continue
        when = rule.get("when")
        if not isinstance(when, dict):
            continue
        all_ok = all(cond_ok(c) for c in when.get("all", [])) if "all" in when else True
        any_list = when.get("any", [])
        any_ok = any(cond_ok(c) for c in any_list) if any_list else True
        if all_ok and any_ok:
            return rule.get("type")
    return None
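
# Usage sketch (hypothetical rule: switch branches when the price came back empty):
#
#   rules = [{"when": {"all": [{"field_empty": "price"}]}, "type": "otodom_v2"}]
#   apply_dynamic_rules(rules, soup, {"price": None})  # -> "otodom_v2"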


def map_data_to_manual_model(data: dict, page: NetworkMonitoredPage) -> dict:
    allowed = {
        f.name
        for f in AdsManual._meta.fields
        if f.name not in ["id", "created_at"]
    }

    mapped = {k: v for k, v in (data or {}).items() if k in allowed}

    # active status and the inactive reason
    if "is_active" in allowed:
        mapped["is_active"] = (data or {}).get("is_active", page.is_active)

    if "inactive_reason" in allowed:
        reason = (data or {}).get("inactive_reason")
        if reason is None:
            meta = getattr(page, "meta", {}) or {}
            reason = meta.get("inactive_reason")
        mapped["inactive_reason"] = reason

    # NOTE: do not overwrite values coming from `data`; only fill in the gaps.
    # ("original_image_urls" is deliberately left off this list, since it already arrives in `data`.)
    for key in [
        "image_links",
        "source",
        "meta",
        "inactive_date",
        "date_fetched",
        "name",
        "estate_type",
        "offer_type",
        # "original_image_urls",  # <- do not overwrite!
    ]:
        if key in allowed and (mapped.get(key) is None):
            mapped[key] = getattr(page, key, None)

    if "url" in allowed and "url" not in mapped:
        mapped["url"] = page.url

    # Extra safeguard: if the model has an original_image_urls field and `data`
    # carries a list, make sure it is preserved.
    if "original_image_urls" in allowed and "original_image_urls" in (data or {}):
        mapped["original_image_urls"] = data["original_image_urls"]

    # --- Post-process fallbacks ---
    # price_per_m2: if not provided by parser, compute using price and square_footage
    try:
        pp2 = mapped.get("price_per_m2", None)
        # consider empty string as missing as well
        is_missing_pp2 = (pp2 is None) or (isinstance(pp2, str) and not pp2.strip())
        if is_missing_pp2:
            price_val = mapped.get("price", (data or {}).get("price"))
            area_val = mapped.get("square_footage", (data or {}).get("square_footage"))

            def _to_float(v):
                if v is None:
                    return None
                if isinstance(v, (int, float)):
                    return float(v)
                try:
                    s = str(v).replace("\xa0", " ").replace(" ", "").replace(",", ".")
                    return float(s)
                except Exception:
                    return None

            p = _to_float(price_val)
            a = _to_float(area_val)
            if p is not None and a is not None and a > 0:
                # store as integer PLN per m2 (rounded)
                computed = int(round(p / a))
                if "price_per_m2" in allowed:
                    mapped["price_per_m2"] = computed
            else:
                if "price_per_m2" in allowed and ("price_per_m2" not in mapped):
                    mapped["price_per_m2"] = None
    except Exception:
        # never break mapping due to fallback calc errors
        pass

    # --- Post-process: normalize area_unit into allowed choices ('m2','ha','ar','ft2','sqft') ---
    try:
        if "area_unit" in allowed:
            raw_u = mapped.get("area_unit", (data or {}).get("area_unit"))
            def _normalize_unit(val):
                if val is None:
                    return None
                s = str(val).strip().lower()
                # Quick fixes for common variants
                s = s.replace("m²", "m2").replace("m^2", "m2").replace("m kw", "m2").replace("mkw", "m2").replace("m.2", "m2")
                # Extract a known token if embedded in longer text
                m = re.search(r"\b(m2|ha|ar|ft2|sqft)\b", s)
                if m:
                    return m.group(1)
                # If contains 'm' and a 2-like suffix, assume m2
                if "m" in s and ("2" in s or "²" in s):
                    return "m2"
                if "hektar" in s:
                    return "ha"
                if "ar" in s:
                    return "ar"
                return None
            norm = _normalize_unit(raw_u)
            if norm is None:
                # Default sensibly to m2 when unit cannot be parsed
                norm = "m2"
            mapped["area_unit"] = norm
    except Exception:
        # unit normalization is best-effort
        pass

    # --- Post-process: coerce invalid/placeholder dates to None and parse valid dates ---
    # Django DateField expects Python date objects; we transform common string formats.
    try:

        def _clean_date_val(val):
            if val is None:
                return None
            # Accept already-correct date objects
            if isinstance(val, date) and not isinstance(val, datetime):
                return val
            s = str(val).strip()
            if not s:
                return None
            # normalize quotes and case
            s_norm = s.replace("\u201e", '"').replace("\u201d", '"').replace("\u2019", "'").lower()
            # placeholders / unknowns
            placeholders = {"brak informacji", "brak", "—", "-", "n/d", "nie podano", "nie dotyczy", "n/a"}
            if s_norm in placeholders:
                return None
            # try multiple formats
            fmts = [
                "%Y-%m-%d",  # 2025-09-01
                "%d.%m.%Y",  # 01.09.2025
                "%d/%m/%Y",  # 01/09/2025
                "%d-%m-%Y",  # 01-09-2025
            ]
            for fmt in fmts:
                try:
                    return datetime.strptime(s, fmt).date()
                except Exception:
                    continue
            # If nothing matched, return None to avoid validation error
            return None

        for df in ("available_from", "listing_date"):
            if df in mapped:
                mapped[df] = _clean_date_val(mapped.get(df))
    except Exception:
        # don't let date cleanup break the mapping
        pass

    return mapped
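
# Usage sketch (hypothetical payload; `page` is a NetworkMonitoredPage, and the
# model is assumed to expose price, square_footage and price_per_m2 fields):
#
#   mapped = map_data_to_manual_model({"price": "650 000", "square_footage": "52,5"}, page)
#   # when price_per_m2 is missing, it is derived: round(650000 / 52.5) -> 12381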
