# import json # import traceback # import re # from bs4 import BeautifulSoup # from django.utils.timezone import now # from extractly.models import NetworkSourceError, NetworkMonitoredPage, ManualDataSource # from manual_agregator.notifications import send_alert_notification # def _resolve_missing_bool(config): # option = config.get("ifMissing", "null") # if option == "true": # return True # elif option == "false": # return False # return None # def _resolve_missing_text(config): # if config.get("ifMissing") == "default": # return config.get("defaultValue", "") # return None # def parse_currency(label_or_text): # """Standardizes and detects currency code (PLN, EUR, USD, GBP, etc).""" # if not label_or_text: # return None # text = label_or_text.strip().lower().replace(" ", "") # CURRENCY_MAP = { # # PLN # "zł": "PLN", "pln": "PLN", "zl": "PLN", "zł.": "PLN", # # EUR # "eur": "EUR", "euro": "EUR", "€": "EUR", # # USD # "usd": "USD", "dolar": "USD", "$": "USD", "us$": "USD", # # GBP # "gbp": "GBP", "funt": "GBP", "£": "GBP", # # ...extend as needed # } # for k, v in CURRENCY_MAP.items(): # if k in text: # return v # return None # def extract_number_and_label(text): # """Extracts number, label (suffix), and currency code from a text.""" # if not text: # return None, None, None # text = text.replace('\xa0', ' ').replace(',', '.') # match = re.search(r'([\d\s]+(?:\.\d+)?)', text) # number = None # label = None # currency = None # if match: # number_str = match.group(1).replace(" ", "") # try: # number = float(number_str) if "." in number_str else int(number_str) # except Exception: # number = None # label = text[match.end():].strip() # # Fix: ALWAYS run the label through parse_currency! 
# currency = parse_currency(label) or parse_currency(text) # return number, label, currency # def extract_value_by_label(soup, label): # for div in soup.find_all("div", attrs={"data-sentry-element": "ItemGridContainer"}): # ps = div.find_all("p") # if len(ps) >= 2 and label in ps[0].text: # return ps[1].text.strip() # return None # def _check_inactive(page, soup, inactive_rules: list) -> bool: # for rule in inactive_rules: # rule_type = rule.get("type") # if rule_type == "text_contains": # text = rule.get("text", "").lower() # if text in page.html.lower(): # return True # elif rule_type == "selector_text": # selector = rule.get("selector") # expected_text = rule.get("text", "").lower() # element = soup.select_one(selector) # if element and element.get_text(strip=True).lower() == expected_text: # return True # elif rule_type == "selector_contains": # selector = rule.get("selector") # expected_text = rule.get("text", "").lower() # element = soup.select_one(selector) # if element and expected_text in element.get_text(strip=True).lower(): # return True # elif rule_type == "selector_missing": # selector = rule.get("selector") # if not soup.select_one(selector): # return True # elif rule_type == "source_field_match": # field = rule.get("field") # match = rule.get("match", "").lower() # value = getattr(page, field, "") # if isinstance(value, dict): # value = json.dumps(value) # if value and match in str(value).lower(): # return True # return False # def extract_image_links(soup, selector=None): # """ # Returns a list of unique image links (src from <img>, srcset from <source>) matching the selector, or from the whole page by default. # """ # urls = set() # context = soup.select(selector) if selector else [soup] # for scope in context: # # All <img> elements # for img in scope.find_all("img"): # src = img.get("src") # if src and src.startswith("http"): # urls.add(src) # # Wszystkie (np. w