# manual_agregator/parser/utils.py

import json
import re
from collections import OrderedDict
from datetime import date, datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Tag

from extractly.models import NetworkMonitoredPage, AdsManual

def resolve_missing_bool(config):
    opt = config.get("ifMissing", "null")
    if opt == "true": return True
    if opt == "false": return False
    return None

def resolve_missing_text(config):
    if config.get("ifMissing") == "default":
        return config.get("defaultValue", "")
    if config.get("ifMissing") == "null":
        return None
    return None
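
# Usage sketch (hypothetical "ifMissing" configs):
#
#   resolve_missing_bool({"ifMissing": "true"})   # -> True
#   resolve_missing_bool({})                      # -> None
#   resolve_missing_text({"ifMissing": "default", "defaultValue": "n/a"})  # -> "n/a"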


def parse_currency(label_or_text):
    if not label_or_text:
        return None
    text = str(label_or_text).strip().lower().replace(" ", "")
    CURRENCY_MAP = {
        "zł":"PLN","pln":"PLN","zl":"PLN","zł.":"PLN",
        "eur":"EUR","euro":"EUR","€":"EUR",
        "usd":"USD","dolar":"USD","$":"USD","us$":"USD",
        "gbp":"GBP","funt":"GBP","£":"GBP",
    }
    for k, v in CURRENCY_MAP.items():
        if k in text:
            return v
    return None
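
# Usage sketch: matching is by substring on the lowercased, space-stripped text,
# so unit suffixes are tolerated:
#
#   parse_currency("1 200 zł")  # -> "PLN"
#   parse_currency("450 €/mc")  # -> "EUR"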

def extract_number_and_label(text):
    if not text:
        return None, None, None
    t = str(text).replace('\xa0', ' ').replace(',', '.')
    m = re.search(r'([\d\s]+(?:\.\d+)?)', t)
    number = label = currency = None
    if m:
        number_str = m.group(1).replace(" ", "")
        try:
            number = float(number_str) if "." in number_str else int(number_str)
        except Exception:
            number = None
        label = t[m.end():].strip()
        currency = parse_currency(label) or parse_currency(t)
    return number, label, currency
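
# Usage sketch (Polish-formatted input; the comma is treated as a decimal separator):
#
#   extract_number_and_label("1 250,50 zł/m²")  # -> (1250.5, "zł/m²", "PLN")
#   extract_number_and_label("42 m²")           # -> (42, "m²", None)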


_CURRENCY_HINTS = ("zł", "pln", "€", "eur", "$", "usd", "£", "gbp")

def _norm_label(s: str) -> str:
    if not s:
        return ""
    s = s.replace("\xa0", " ")
    s = re.sub(r"\s+", " ", s)
    s = s.strip()
    if s.endswith(":"):
        s = s[:-1].strip()
    return s.lower()

def extract_value_by_label(soup: BeautifulSoup, config: dict):
    """
    Backwards-compatible label→value extractor.
    Handles:
      - OtoDom: <div data-sentry-element="ItemGridContainer"><p>LABEL</p><p>VALUE</p></div>
      - fallbacks: 'sibling'/'parent'/'next_tag' + valueTags
    Config keys (unchanged from before; all optional except 'label'):
      - label: str
      - selector: str (narrows the search scope)
      - altLabels: list[str]
      - caseInsensitive: bool (default True)
      - match: "equals" | "contains" (default "equals")
      - valueTags: list[str] (default ["p","span","div"])
      - valueClasses: list[str] (restrict value elements to these CSS classes)
      - labelPosition: "sibling" | "parent" | "next_tag" (default "sibling")
      - parentFallback: bool (default True)
    """
    if not soup or not config or not config.get("label"):
        return None

    label            = config.get("label")
    alt_labels       = config.get("altLabels", []) or []
    case_insensitive = config.get("caseInsensitive", True)
    match_type       = config.get("match", "equals")
    value_tags       = tuple(config.get("valueTags") or ("p", "span", "div"))
    value_classes    = {c for c in config.get("valueClasses", []) if c}
    strategy         = config.get("labelPosition", "sibling")
    root_selector    = config.get("selector")
    allow_parent_fallback = config.get("parentFallback", True)

    def _norm(s: str) -> str:
        # _norm_label lowercases; redo its cleanup without lowering in case-sensitive mode
        if case_insensitive:
            return _norm_label(s)
        s = re.sub(r"\s+", " ", (s or "").replace("\xa0", " ")).strip()
        return s[:-1].strip() if s.endswith(":") else s

    wanted = {_norm(label)} | {_norm(x) for x in alt_labels}

    def is_match(text: str) -> bool:
        t = _norm(text)
        if match_type == "contains":
            return any(w in t for w in wanted)
        return t in wanted

    roots = soup.select(root_selector) if root_selector else [soup]

    # --- 1) OtoDom mode: rows rendered as ItemGridContainer (or with class css-1xw0jqp)
    for root in roots:
        rows = root.select('div[data-sentry-element="ItemGridContainer"], div.css-1xw0jqp')
        for row in rows:
            ps = row.find_all("p", recursive=True)
            if len(ps) < 2:
                continue
            lab = ps[0].get_text(" ", strip=True)
            if not is_match(lab):
                continue
            val_text = ps[1].get_text(" ", strip=True)
            if val_text:
                return val_text

    # --- 2) Fallback: the pre-existing lookup strategies
    def _good_value(text: str) -> bool:
        t = (text or "").strip()
        return bool(re.search(r"\d", t)) or any(h in t.lower() for h in _CURRENCY_HINTS)

    # Find every potential label node within the selected scope
    label_nodes = []
    for root in roots:
        for el in root.find_all(True):
            txt = el.get_text(" ", strip=True)
            if txt and is_match(txt):
                label_nodes.append(el)

    if not label_nodes:
        return None

    def _matches_value_class(tag: Tag) -> bool:
        if not value_classes:
            return True
        classes = tag.get("class")
        if not classes:
            return False
        if isinstance(classes, str):
            classes = classes.split()
        return any(cls in classes for cls in value_classes)

    for el in label_nodes:
        # strategy: sibling (also the first step of the 'parent' strategy)
        if strategy in ("sibling", "parent"):
            sib = el.find_next_sibling()
            while isinstance(sib, Tag):
                if sib.name in value_tags and _matches_value_class(sib):
                    val = sib.get_text(" ", strip=True)
                    if val:
                        return val
                    # icon-only boolean support: detect images with yes/no icons
                    img = sib.find("img")
                    src = (img.get("src") or "").lower() if img else ""
                    if "ikona_tak" in src or "icon_yes" in src or "yes" in src:
                        return "tak"
                    if "ikona_nie" in src or "icon_no" in src or "no" in src:
                        return "nie"
                sib = sib.find_next_sibling()

        # strategy: next_tag (checked before the parent fallback, which would otherwise shadow it)
        if strategy == "next_tag":
            nxt = el.find_next()
            if isinstance(nxt, Tag) and nxt.name in value_tags:
                val = nxt.get_text(" ", strip=True)
                if val:
                    return val

        # parent fallback (walk up at most 3 levels and pick the best candidate)
        if allow_parent_fallback:
            up = el
            for _ in range(3):
                up = up.parent if isinstance(up, Tag) else None
                if not isinstance(up, Tag):
                    break
                candidates = []
                for tag_name in value_tags:
                    for cand in up.find_all(tag_name, recursive=True):
                        if not _matches_value_class(cand):
                            continue
                        if cand is el:
                            continue
                        txt = cand.get_text(" ", strip=True)
                        if not txt:
                            continue
                        score = 2 if _good_value(txt) else 1
                        # prefer candidates that appear after the label (when sourceline is available)
                        c_line = getattr(cand, "sourceline", 0) or 0
                        e_line = getattr(el, "sourceline", 0) or 0
                        if c_line >= e_line and (c_line or e_line):
                            score += 1
                        candidates.append((score, txt))
                if candidates:
                    candidates.sort(key=lambda x: x[0], reverse=True)
                    return candidates[0][1]

        # Final fallback for icon-only values: scan up to 3 following sibling cells for an img
        value_cell = el.find_next_sibling()
        hops = 0
        while isinstance(value_cell, Tag) and hops < 3:
            img = value_cell.find("img")
            src = (img.get("src") or "").lower() if img else ""
            if "ikona_tak" in src or "icon_yes" in src or "yes" in src:
                return "tak"
            if "ikona_nie" in src or "icon_no" in src or "no" in src:
                return "nie"
            value_cell = value_cell.find_next_sibling()
            hops += 1

    return None
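
# Usage sketch (illustrative HTML fragment, not a real listing page):
#
#   soup = BeautifulSoup(
#       '<div data-sentry-element="ItemGridContainer">'
#       '<p>Czynsz</p><p>650 zł</p></div>', "html.parser")
#   extract_value_by_label(soup, {"label": "Czynsz"})  # -> "650 zł"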

# --------- image link extraction ---------

def _normalize_url(u: str) -> str:
    if not u:
        return ""
    u = u.strip()
    # protocol-relative -> https
    if u.startswith("//"):
        return "https:" + u
    return u

def _split_srcset(srcset: str):
    """
    Parses a srcset attribute and returns the list of URLs (without the w/h/x descriptors).
    """
    urls = []
    if not srcset:
        return urls
    for part in srcset.split(","):
        cand = part.strip().split(" ")[0].strip()
        if cand:
            urls.append(cand)
    return urls

def extract_image_links(soup, selector=None, base_url: str = None):
    """
    Returns absolute image URLs, de-duplicated, preserving document order.
    Supports <img src>, lazy-loading attributes, <source srcset>, and meta og:image/twitter:image.
    When `selector` is given, only the matching DOM fragment(s) are searched.
    """
    seen = OrderedDict()
    scopes = soup.select(selector) if selector else [soup]

    def add(url: str):
        url = _normalize_url(url)
        if not url:
            return
        # http(s) only
        if not (url.startswith("http://") or url.startswith("https://")):
            # resolve relative paths against base_url when available; skip data: URIs
            if base_url and not url.startswith("data:"):
                url = urljoin(base_url, url)
                if not (url.startswith("http://") or url.startswith("https://")):
                    return
            else:
                return
        if url not in seen:
            seen[url] = True

    for scope in scopes:
        # 1) classic <img>
        for img in scope.find_all("img"):
            # plain src
            add(img.get("src"))
            # lazy-loading attributes
            add(img.get("data-src"))
            add(img.get("data-original"))
            add(img.get("data-lazy"))
            # some CDNs use data-srcset
            srcset = img.get("srcset") or img.get("data-srcset")
            if srcset:
                for u in _split_srcset(srcset):
                    add(u)

        # 2) <source srcset> inside <picture>
        for src in scope.find_all("source"):
            srcset = src.get("srcset") or src.get("data-srcset")
            if srcset:
                for u in _split_srcset(srcset):
                    add(u)

        # 3) <noscript> with an <img> inside
        for ns in scope.find_all("noscript"):
            # noscript often carries raw <img ...> HTML; re-parse its contents
            inner = BeautifulSoup(ns.decode_contents(), "html.parser")
            for img in inner.find_all("img"):
                add(img.get("src"))
                add(img.get("data-src"))
                srcset = img.get("srcset") or img.get("data-srcset")
                if srcset:
                    for u in _split_srcset(srcset):
                        add(u)

    # 4) meta og:image / twitter:image (looked up globally, outside the scopes).
    #    Appended last, as a fallback.
    og = soup.find("meta", attrs={"property": "og:image"})
    if og and og.get("content"):
        add(og["content"])
    tw = soup.find("meta", attrs={"name": "twitter:image"})
    if tw and tw.get("content"):
        add(tw["content"])

    return list(seen.keys())
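
# Usage sketch (illustrative markup; base_url resolves the relative path):
#
#   soup = BeautifulSoup('<img src="/media/a.jpg">'
#                        '<img data-src="https://cdn.example.com/b.jpg">', "html.parser")
#   extract_image_links(soup, base_url="https://example.com")
#   # -> ["https://example.com/media/a.jpg", "https://cdn.example.com/b.jpg"]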


def value_is_empty(v):
    if v is None: return True
    if isinstance(v, str) and v.strip() == "": return True
    if isinstance(v, (list, dict)) and not v: return True
    return False

def check_inactive(page, soup, inactive_rules: list) -> bool:
    for rule in inactive_rules or []:
        t = rule.get("type")
        if t == "text_contains":
            txt = (rule.get("text") or "").lower()
            if txt and txt in (page.html or "").lower():
                return True
        elif t == "selector_text":
            sel = rule.get("selector")
            expected = (rule.get("text") or "").lower()
            el = soup.select_one(sel) if sel else None
            if el and el.get_text(strip=True).lower() == expected:
                return True
        elif t == "selector_contains":
            sel = rule.get("selector")
            txt = (rule.get("text") or "").lower()
            el = soup.select_one(sel) if sel else None
            if el and txt in el.get_text(" ", strip=True).lower():
                return True
        elif t == "selector_missing":
            sel = rule.get("selector")
            if sel and not soup.select_one(sel):
                return True
        elif t == "source_field_match":
            field = rule.get("field")
            match = (rule.get("match") or "").lower()
            value = getattr(page, field, "") if field else ""
            if isinstance(value, dict):
                value = json.dumps(value)
            if value and match in str(value).lower():
                return True
    return False
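
# Usage sketch (hypothetical rules; `page.html` holds the raw page source):
#
#   rules = [
#       {"type": "text_contains", "text": "ogłoszenie nieaktualne"},
#       {"type": "selector_missing", "selector": "div.offer-price"},
#   ]
#   check_inactive(page, soup, rules)  # -> True if any rule matches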

def is_field_config_dict(d: dict) -> bool:
    try:
        return any(
            isinstance(v, dict) and ('fieldType' in v or 'selectors' in v or 'fromMain' in v)
            for v in d.values()
        )
    except Exception:
        return False

# --------- normalization helpers ---------

def normalize_rules(rules):
    """
    Returns a list of rules (list[dict]) or an empty list.
    Accepts: None, str (JSON), dict, list/tuple.
    Any other type -> [].
    """
    if not rules:
        return []
    try:
        if isinstance(rules, str):
            rules = json.loads(rules)
    except Exception:
        return []
    if isinstance(rules, dict):
        return [rules]
    if isinstance(rules, (list, tuple)):
        return list(rules)
    return []

def normalize_selectors(selectors):
    """
    Returns a dict of selectors (either branches or a flat field map).
    Accepts: None -> {}, str (JSON) -> dict or {},
             dict -> dict, anything else -> {}.
    """
    if not selectors:
        return {}
    if isinstance(selectors, str):
        try:
            sel = json.loads(selectors)
            return sel if isinstance(sel, dict) else {}
        except Exception:
            return {}
    return selectors if isinstance(selectors, dict) else {}
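
# Usage sketch:
#
#   normalize_rules('[{"selector": ".badge", "match": "sprzedane", "type": "sold"}]')
#   # -> [{"selector": ".badge", "match": "sprzedane", "type": "sold"}]
#   normalize_selectors(None)        # -> {}
#   normalize_selectors("not json")  # -> {}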

# --------- branch selection ---------

def resolve_selectors(manual_config, soup):
    """
    ALWAYS returns a tuple: (selectors_dict, selected_type|None).
    Handles:
      - legacy-style rules: {'selector','match','type'} / {'source','match','type'}
      - a 'default' branch
      - a flat field map
      - a single branch
    """
    selected_type = None
    rules = normalize_rules(getattr(manual_config, "rules", None))
    all_sel = normalize_selectors(getattr(manual_config, "selectors", None))

    # legacy rule style (selector + match => type)
    for rule in rules:
        if not isinstance(rule, dict):
            continue
        if "selector" in rule and "type" in rule:
            sel = rule.get("selector")
            el = soup.select_one(sel) if sel else None
            if el:
                content = el.get_text(strip=True).lower()
                mv = rule.get("match", [])
                if isinstance(mv, str):
                    mv = [mv]
                if any(v.lower() in content for v in mv):
                    selected_type = rule.get("type")
                    break
        elif "source" in rule and "type" in rule:
            # kept for backwards compatibility
            pass

    if selected_type and selected_type in all_sel and isinstance(all_sel[selected_type], dict):
        return all_sel[selected_type], selected_type

    if "default" in all_sel and isinstance(all_sel["default"], dict):
        return all_sel["default"], "default"

    if is_field_config_dict(all_sel):
        return all_sel, "flat"

    if len(all_sel) == 1:
        k, v = next(iter(all_sel.items()))
        if isinstance(v, dict):
            return v, k

    # no usable selectors
    return {}, None
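
# Usage sketch (hypothetical config object with a 'default' branch, given a parsed `soup`):
#
#   class Cfg:
#       rules = None
#       selectors = {"default": {"price": {"fieldType": "text"}}}
#   resolve_selectors(Cfg(), soup)  # -> ({"price": {"fieldType": "text"}}, "default")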

# --------- dynamic rules (applied after the first parse) ---------

def apply_dynamic_rules(rules, soup, extracted):
    """
    New style:
      { "when": { "all": [...], "any": [...] }, "type": "otodom_v2" }
    Returns the branch name (type) or None.
    """
    def cond_ok(cond: dict) -> bool:
        if "field_empty" in cond:
            return value_is_empty(extracted.get(cond["field_empty"]))
        if "field_missing" in cond:
            return extracted.get(cond["field_missing"]) is None
        if "selector_exists" in cond:
            sel = cond.get("selector_exists")
            return bool(sel and soup.select_one(sel))
        if "selector_missing" in cond:
            sel = cond.get("selector_missing")
            return bool(sel) and (soup.select_one(sel) is None)
        if "selector_contains" in cond:
            sel = cond["selector_contains"].get("selector")
            txt = (cond["selector_contains"].get("text") or "").lower()
            el = soup.select_one(sel) if sel else None
            return bool(el and txt in el.get_text(" ", strip=True).lower())
        return False

    rules_list = normalize_rules(rules)
    for rule in rules_list:
        if not isinstance(rule, dict):
            continue
        when = rule.get("when")
        if not isinstance(when, dict):
            continue
        all_ok = all(cond_ok(c) for c in when.get("all", [])) if "all" in when else True
        any_list = when.get("any", [])
        any_ok = any(cond_ok(c) for c in any_list) if any_list else True
        if all_ok and any_ok:
            return rule.get("type")
    return None
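
# Usage sketch (hypothetical rule: switch branches when the price came back empty):
#
#   rules = [{"when": {"all": [{"field_empty": "price"}]}, "type": "otodom_v2"}]
#   apply_dynamic_rules(rules, soup, {"price": None})  # -> "otodom_v2"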


def map_data_to_manual_model(data: dict, page: NetworkMonitoredPage) -> dict:
    allowed = {
        f.name
        for f in AdsManual._meta.fields
        if f.name not in ["id", "created_at"]
    }

    mapped = {k: v for k, v in (data or {}).items() if k in allowed}

    # active status and the inactive reason
    if "is_active" in allowed:
        mapped["is_active"] = (data or {}).get("is_active", page.is_active)

    if "inactive_reason" in allowed:
        reason = (data or {}).get("inactive_reason")
        if reason is None:
            meta = getattr(page, "meta", {}) or {}
            reason = meta.get("inactive_reason")
        mapped["inactive_reason"] = reason

    # NOTE: do not overwrite values coming from `data`; only fill in the gaps.
    # ("original_image_urls" is deliberately left off this list, since it already arrives in `data`.)
    for key in [
        "image_links",
        "source",
        "meta",
        "inactive_date",
        "date_fetched",
        "name",
        "estate_type",
        "offer_type",
        # "original_image_urls",  # <- do not overwrite!
    ]:
        if key in allowed and (mapped.get(key) is None):
            mapped[key] = getattr(page, key, None)

    if "url" in allowed and "url" not in mapped:
        mapped["url"] = page.url

    # Extra safeguard: if the model has an original_image_urls field and `data`
    # carries a list, make sure it is preserved.
    if "original_image_urls" in allowed and "original_image_urls" in (data or {}):
        mapped["original_image_urls"] = data["original_image_urls"]

    # --- Post-process fallbacks ---
    # price_per_m2: if not provided by parser, compute using price and square_footage
    try:
        pp2 = mapped.get("price_per_m2", None)
        # consider empty string as missing as well
        is_missing_pp2 = (pp2 is None) or (isinstance(pp2, str) and not pp2.strip())
        if is_missing_pp2:
            price_val = mapped.get("price", (data or {}).get("price"))
            area_val = mapped.get("square_footage", (data or {}).get("square_footage"))

            def _to_float(v):
                if v is None:
                    return None
                if isinstance(v, (int, float)):
                    return float(v)
                try:
                    s = str(v).replace("\xa0", " ").replace(" ", "").replace(",", ".")
                    return float(s)
                except Exception:
                    return None

            p = _to_float(price_val)
            a = _to_float(area_val)
            if p is not None and a is not None and a > 0:
                # store as integer PLN per m2 (rounded)
                computed = int(round(p / a))
                if "price_per_m2" in allowed:
                    mapped["price_per_m2"] = computed
            else:
                if "price_per_m2" in allowed and ("price_per_m2" not in mapped):
                    mapped["price_per_m2"] = None
    except Exception:
        # never break mapping due to fallback calc errors
        pass

    # --- Post-process: normalize area_unit into allowed choices ('m2','ha','ar','ft2','sqft') ---
    try:
        if "area_unit" in allowed:
            raw_u = mapped.get("area_unit", (data or {}).get("area_unit"))
            def _normalize_unit(val):
                if val is None:
                    return None
                s = str(val).strip().lower()
                # Quick fixes for common variants
                s = s.replace("m²", "m2").replace("m^2", "m2").replace("m kw", "m2").replace("mkw", "m2").replace("m.2", "m2")
                # Extract a known token if embedded in longer text
                m = re.search(r"\b(m2|ha|ar|ft2|sqft)\b", s)
                if m:
                    return m.group(1)
                # If contains 'm' and a 2-like suffix, assume m2
                if "m" in s and ("2" in s or "²" in s):
                    return "m2"
                if "hektar" in s:
                    return "ha"
                if "ar" in s:
                    return "ar"
                return None
            norm = _normalize_unit(raw_u)
            if norm is None:
                # Default sensibly to m2 when unit cannot be parsed
                norm = "m2"
            mapped["area_unit"] = norm
    except Exception:
        # unit normalization is best-effort
        pass

    # --- Post-process: coerce invalid/placeholder dates to None and parse valid dates ---
    # Django DateField expects Python date objects; we transform common string formats.
    try:

        def _clean_date_val(val):
            if val is None:
                return None
            # Accept already-correct date objects
            if isinstance(val, date) and not isinstance(val, datetime):
                return val
            s = str(val).strip()
            if not s:
                return None
            # normalize quotes and case
            s_norm = s.replace("\u201e", '"').replace("\u201d", '"').replace("\u2019", "'").lower()
            # placeholders / unknowns
            placeholders = {"brak informacji", "brak", "—", "-", "n/d", "nie podano", "nie dotyczy", "n/a"}
            if s_norm in placeholders:
                return None
            # try multiple formats
            fmts = [
                "%Y-%m-%d",  # 2025-09-01
                "%d.%m.%Y",  # 01.09.2025
                "%d/%m/%Y",  # 01/09/2025
                "%d-%m-%Y",  # 01-09-2025
            ]
            for fmt in fmts:
                try:
                    return datetime.strptime(s, fmt).date()
                except Exception:
                    continue
            # If nothing matched, return None to avoid validation error
            return None

        for df in ("available_from", "listing_date"):
            if df in mapped:
                mapped[df] = _clean_date_val(mapped.get(df))
    except Exception:
        # don't let date cleanup break the mapping
        pass

    return mapped
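
# Usage sketch (hypothetical payload; `page` is a NetworkMonitoredPage, and the
# model is assumed to expose price, square_footage and price_per_m2 fields):
#
#   mapped = map_data_to_manual_model({"price": "650 000", "square_footage": "52,5"}, page)
#   # when price_per_m2 is missing, it is derived: round(650000 / 52.5) -> 12381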
