from django.core.management.base import BaseCommand
from bs4 import BeautifulSoup
from manual_agregator.parser.inactive import is_inactive
from extractly.models import SourceNetwork
import requests


# Built-in synthetic HTML fixtures for exercising "inactive listing" rules
# without hitting the network.
#   "banner"    — page with the portal's expired-ad alert element
#                 (matched via the data-cy='expired-ad-alert' attribute).
#   "text_only" — page with no marker element, only free text stating the
#                 offer was removed / is no longer available (Polish).
# NOTE: the HTML strings are runtime test data — do not reword them.
SAMPLES = {
    "banner": """
    <html><body>
      <div data-cy='expired-ad-alert'>Ogłoszenie wygasło</div>
    </body></html>
    """,
    "text_only": """
    <html><body>
      <div>Ta oferta została usunięta. Ogłoszenie jest już niedostępne.</div>
    </body></html>
    """,
}


class Command(BaseCommand):
    """Management command that exercises a SourceNetwork's "inactive" rules.

    The HTML under test comes from (in priority order):
      1. a live fetch of --url when --fetch is given,
      2. a local file given via --html-file,
      3. one of the built-in SAMPLES.
    The result of ``is_inactive`` (flag + reason) is printed to stdout.
    """

    help = "Test inactive rules for a given SourceNetwork.name using synthetic HTML and optional file."

    def add_arguments(self, parser):
        parser.add_argument("--source-name", default="Otodom", help="SourceNetwork.name to test (default: Otodom)")
        parser.add_argument("--sample", choices=["banner", "text_only"], default="banner", help="Which built-in sample to test")
        parser.add_argument("--html-file", help="Optional path to an HTML file to test instead of a sample")
        parser.add_argument("--url", help="Optional URL string to test and to set page.url")
        parser.add_argument("--fetch", action="store_true", help="If set, fetch HTML from --url and test against it")

    def handle(self, *args, **options):
        # argparse normalizes "--source-name" to the key "source_name";
        # a "source-name" key never exists in options.
        name = options.get("source_name") or "Otodom"
        sample = options.get("sample")
        html_file = options.get("html_file")
        # Keep the user-supplied URL separate from the effective page URL so
        # that "--fetch without --url" can still be detected and rejected.
        explicit_url = options.get("url")
        test_url = explicit_url or "https://otodom.pl/pl/oferta/przyklad"

        src = SourceNetwork.objects.filter(name__iexact=name).first()
        if not src or not getattr(src, "manual_data_source_fetcher", None):
            self.stderr.write(self.style.ERROR(f"Source '{name}' not found or has no SourceManual."))
            return
        manual = src.manual_data_source_fetcher
        rules = manual.inactive or []

        if options.get("fetch"):
            # Validate the raw option, not the defaulted test_url — otherwise
            # --fetch without --url would silently fetch the example URL.
            if not explicit_url:
                self.stderr.write(self.style.ERROR("--fetch requires --url"))
                return
            try:
                resp = requests.get(test_url, timeout=20, headers={
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0 Safari/537.36"
                })
                # Do not raise on HTTP errors; use whatever body is available (e.g., 410/404 pages)
                html = resp.text or ""
            except Exception as e:
                self.stderr.write(self.style.ERROR(f"Failed to fetch URL: {e}"))
                return
        elif html_file:
            try:
                with open(html_file, "r", encoding="utf-8") as f:
                    html = f.read()
            except Exception as e:
                self.stderr.write(self.style.ERROR(f"Failed to read file: {e}"))
                return
        else:
            # --sample is constrained by argparse choices; the fallback to
            # "banner" is defensive only.
            html = SAMPLES.get(sample, SAMPLES["banner"]).strip()

        soup = BeautifulSoup(html, "html.parser")

        # Minimal stand-in for the page object expected by is_inactive;
        # only the .url attribute is consumed here.
        class _Page:
            url = test_url
        page = _Page()

        inactive, reason = is_inactive(html=html, soup=soup, rules=rules, page=page, extracted={})
        self.stdout.write(f"inactive={inactive}, reason={reason}")