# manual_agregator/run_parser.py
import json
import logging
import re
import traceback
from datetime import date, datetime
from decimal import Decimal, InvalidOperation
from typing import Iterable, Optional, Set
from urllib.parse import urlparse

from django.db import models, transaction
from django.db.models import Q

from extractly.models import (
    NetworkMonitoredPage,
    AdsManual,
    SourceManual,
    ManualParserLog,
)
from manual_agregator.parser.utils import map_data_to_manual_model
from manual_agregator.parser import parse_manual_data
from manual_agregator.description_extrator.core.nlp_extractor import NLPEnhancedExtractor

logger = logging.getLogger(__name__)


def safe_json_loads(s: str) -> dict:
    if not s:
        return {}
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        cleaned = re.sub(r"[\x00-\x1F\x7F]", "", s)
        try:
            return json.loads(cleaned or "{}")
        except json.JSONDecodeError:
            logger.warning("Failed to parse JSON even after cleaning.")
            return {}

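# Example (illustrative): JSON with an embedded control character fails the
# first parse but succeeds after cleaning; hopeless input falls back to {}.
#   safe_json_loads('{"title": "Flat\x00 31 m2"}')  ->  {"title": "Flat 31 m2"}
#   safe_json_loads("not json at all")              ->  {}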

def _normalize_rules_for_queryset(raw_rules):
    """
    Zwraca dict do filtrowania querysetu stron.
    Akceptuje:
      - None -> {}
      - str(JSON) -> jeśli dict, zwróć go; jeśli list -> zignoruj (to layout rules)
      - dict -> przefiltruj po polach modelu i zwróć
      - list/tuple -> zignoruj (to layout rules)
      - inne typy -> {}
    """
    if not raw_rules:
        return {}

    parsed = raw_rules
    if isinstance(raw_rules, str):
        try:
            parsed = json.loads(raw_rules)
        except Exception:
            return {}

    if isinstance(parsed, (list, tuple)):
        return {}

    if isinstance(parsed, dict):
        model_fields = {f.name for f in NetworkMonitoredPage._meta.fields}
        return {k: v for k, v in parsed.items() if k in model_fields}

    return {}

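# Example (illustrative; "is_active" is assumed to exist on NetworkMonitoredPage):
#   _normalize_rules_for_queryset('{"is_active": true, "unknown": 1}')  ->  {"is_active": True}
#   _normalize_rules_for_queryset('[{"selector": ".price"}]')           ->  {}  (layout rules, ignored)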

def _is_empty_value(val, include_zero: bool) -> bool:
    # Strings are empty when they are blank after strip()
    if isinstance(val, str):
        return val.strip() == ""
    # None is always empty
    if val is None:
        return True
    # Numbers are empty only when 0 counts as empty (include_zero)
    if isinstance(val, (int, float)) and include_zero:
        return val == 0
    return False

def _matches_empty_policy(mapped: dict, fields: Iterable[str], require_all: bool, include_zero: bool) -> bool:
    # We assume the mapping uses the keys: 'price', 'title', 'description', 'address'
    checks = [_is_empty_value(mapped.get(f), include_zero) for f in fields]
    return all(checks) if require_all else any(checks)

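# Example (illustrative): with fields=('price', 'title') and require_all=False,
# {'price': 0, 'title': 'Flat'} matches only when include_zero=True (the zero
# price counts as empty); with require_all=True the title would also need to be blank.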

def process_manual_queue(
    limit: int = 6000,
    name_filters=None,
    only_id: Optional[int] = None,
    dry_run: bool = False,
    enable: bool = True,
    force: bool = False,
    manual_id: Optional[int] = None,
    force_names: Optional[Set[str]] = None,
    only_active: bool = False,
    only_inactive: bool = False,
    # New filtering options:
    only_empty_fields: Optional[Iterable[str]] = None,
    require_all_empty: bool = False,
    empty_include_zero: bool = False,
    # CLI override for description enrichment: True enables, False disables, None -> use DB setting
    use_description_scraper: Optional[bool] = None,
) -> int:
    """
    ... (docstring skrócony) ...
    only_empty_fields: iter pól z {'price','title','description','address'} — filtruj do pustych.
    require_all_empty: jeśli True, wszystkie podane pola muszą być puste (inaczej: wystarczy któreś).
    empty_include_zero: jeśli True, 0 uznajemy za puste dla price.
    """
    processed_total = 0
    # select the manuals to process
    if manual_id:
        manuals = list(SourceManual.objects.filter(id=manual_id, enable=True))
        if not manuals:
            raise ValueError(f"No ManualDataSource found for id={manual_id}.")
    else:
        manuals = list(SourceManual.objects.filter(enable=True))
        if not manuals:
            raise ValueError("No active ManualDataSource to process.")

    print("\nManuals to process:")
    for m in manuals:
        print(f" - [{m.id}] {getattr(m, 'name', str(m))} (source_id={m.source_id})")
    print("")

    # normalize the set of names for matching (lowercased)
    force_names_norm: Optional[Set[str]] = (
        {n.lower() for n in force_names} if force_names else None
    )

    for idx, manual_obj in enumerate(manuals, 1):
        manual_name = str(getattr(manual_obj, "name", "") or "").strip()
        source_title = str(getattr(manual_obj.source, "title", "") or "").strip()
        source_name = str(getattr(manual_obj.source, "name", "") or "").strip()

        candidates_to_match = {
            manual_name.lower(),
            source_title.lower(),
            source_name.lower(),
        }

        forced_here = bool(force) or (
            force_names_norm is not None and any(n in candidates_to_match for n in force_names_norm)
        )

        print(
            f"\n>> START [{idx}/{len(manuals)}] Manual: [{manual_obj.id}] "
            f"{manual_name or manual_obj} (source_id={manual_obj.source_id}) | "
            f"FORCED={forced_here} (global={bool(force)}; by-name={bool(force_names_norm)})"
        )

        # --- Page queryset filtering (NOT to be confused with the parser's layout rules) ---
        qs_rules = _normalize_rules_for_queryset(getattr(manual_obj, "rules", None))

        # Shared exclusion: pages whose HTML payloads are missing or effectively empty
        empty_content_q = (
            Q(sliced_html__isnull=True)
            | Q(sliced_html__exact="")
            | Q(sliced_html__exact="{}")
            | Q(sliced_html__exact="[]")
            | Q(sliced_html__exact=" ")
            | Q(html__isnull=True)
            | Q(html__exact="")
            | Q(html__exact="{}")
            | Q(html__exact="[]")
            | Q(html__exact=" ")
        )

        if forced_here:
            # Force mode reprocesses every page of the source, including pages
            # already linked to an ad; it additionally drops pages whose html is "error".
            pages_qs = (
                NetworkMonitoredPage.objects.filter(source=manual_obj.source)
                .exclude(empty_content_q | Q(html__exact="error"))
                .order_by("id")
            )
        else:
            pages_qs = (
                NetworkMonitoredPage.objects.filter(
                    network_ad_manual__isnull=True,
                    source=manual_obj.source,
                    **qs_rules,
                )
                .exclude(empty_content_q)
                .order_by("id")
            )

        # is_active filters
        if only_active:
            pages_qs = pages_qs.filter(is_active=True)
        elif only_inactive:
            pages_qs = pages_qs.filter(is_active=False)

        # Pre-filter on empty fields (only possible when a linked ad already
        # exists, which in practice means force mode)
        if only_empty_fields and forced_here:
            def q_empty_for(field: str) -> Q:
                f = field.lower().strip()
                if f == 'price':
                    q = Q(network_ad_manual__price__isnull=True)
                    if empty_include_zero:
                        q |= Q(network_ad_manual__price=0)
                    return q
                elif f in ('title', 'description', 'address'):
                    return Q(**{f'network_ad_manual__{f}__isnull': True}) | Q(**{f'network_ad_manual__{f}__exact': ''})
                else:
                    return Q(pk__in=[])  # unknown field => match nothing
            qs_parts = [q_empty_for(f) for f in only_empty_fields]
            q_total = Q()
            if require_all_empty:
                # AND all parts: every listed field must be empty
                for part in qs_parts:
                    q_total &= part
            else:
                # OR all parts: any listed field being empty is enough
                for part in qs_parts:
                    q_total |= part
            pages_qs = pages_qs.filter(q_total)
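            # Example (illustrative): only_empty_fields=('price', 'title') with
            # require_all_empty=False keeps pages whose linked ad has a NULL
            # price (or price=0 when empty_include_zero) OR a NULL/blank title.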

        if only_id:
            pages_qs = pages_qs.filter(id=only_id)
        if name_filters:
            pages_qs = pages_qs.filter(name__icontains=name_filters)

        total_to_process = pages_qs.count()
        print(
            f"Manual: [{manual_obj.id}] {getattr(manual_obj, 'name', manual_obj)} – "
            f"total to process: {total_to_process} listings (limit: {limit})"
        )

        pages = pages_qs[:limit]
        processed_count = 0

        # Lazy-init extractor per manual to avoid repeated construction
        extractor = None

        def _get_setting_bool(manual: SourceManual, key: str, default: bool = False) -> bool:
            try:
                selectors = getattr(manual, "selectors", None) or {}
                if isinstance(selectors, str):
                    selectors = json.loads(selectors)
                settings = selectors.get("settings") or {}
                val = settings.get(key, default)
                return bool(val)
            except Exception:
                return default
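        # Expected selectors JSON shape (an assumption inferred from the lookup
        # above; real manuals may carry additional keys):
        #   {"settings": {"use_description_scraper": true}, ...}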

        def _should_use_description_scraper(manual: SourceManual) -> bool:
            # CLI override takes precedence; if None, fall back to JSON toggle under selectors.settings.use_description_scraper
            if use_description_scraper is not None:
                return bool(use_description_scraper)
            return _get_setting_bool(manual, "use_description_scraper", False)

        def _merge_from_description_if_enabled(ad: AdsManual, manual: SourceManual) -> None:
            nonlocal extractor
            try:
                if not _should_use_description_scraper(manual):
                    return
                desc = getattr(ad, "description", None)
                if not desc:
                    return
                # Build extractor lazily once per manual
                if extractor is None:
                    extractor = NLPEnhancedExtractor()
                results = extractor.extract_all(desc)
                if not results:
                    return

                # Determine which fields to update (only empty/null ones)
                update_payload = {}
                updated_by_description = []

                # Create a mapping from field name to Django field instance
                model_fields = {f.name: f for f in ad._meta.get_fields() if hasattr(f, "attname")}

                def _is_empty_field(val, field):
                    if isinstance(field, (models.CharField, models.TextField)):
                        return val is None or (isinstance(val, str) and val.strip() == "")
                    # For numeric/bool/date fields only None counts as empty
                    return val is None

                def _coerce_value_for_field(value, field):
                    # Char/Text
                    if isinstance(field, (models.CharField, models.TextField)):
                        if value is None:
                            return None
                        # Append a '*' marker to flag description-based enrichment
                        s = str(value)
                        max_len = getattr(field, "max_length", None)
                        if not s.endswith("*"):
                            marker = "*"
                            if isinstance(max_len, int) and max_len:
                                if len(s) + 1 > max_len:
                                    # Truncate so the marker still fits within max_length
                                    s = (s[: max_len - 1] if max_len > 1 else "") + marker
                                else:
                                    s = s + marker
                            else:
                                s = s + marker
                        return s
                    # Boolean
                    if isinstance(field, models.BooleanField):
                        if value in (True, False):
                            return value
                        # Try to coerce from common truthy/falsy strings (English and Polish)
                        if isinstance(value, str):
                            v = value.strip().lower()
                            if v in ("true", "1", "tak", "yes"):
                                return True
                            if v in ("false", "0", "nie", "no"):
                                return False
                        return None
                    # Integers
                    if isinstance(field, models.IntegerField) and not isinstance(field, models.BooleanField):
                        try:
                            return None if value is None else int(value)
                        except (TypeError, ValueError):
                            return None
                    # Decimal/Float
                    if isinstance(field, (models.DecimalField, models.FloatField)):
                        if value is None:
                            return None
                        if isinstance(value, Decimal):
                            return value
                        try:
                            # Normalize NBSP, thousands separators and decimal commas
                            s = str(value).replace("\xa0", " ").replace(" ", "").replace(",", ".")
                            return Decimal(s)
                        except (InvalidOperation, ValueError, TypeError):
                            try:
                                return float(value)
                            except (TypeError, ValueError):
                                return None
                    # Date (DateTimeField is a DateField subclass: keep datetimes
                    # intact for it, otherwise reduce datetimes to plain dates)
                    if isinstance(field, models.DateField):
                        if value is None:
                            return None
                        if isinstance(value, (datetime, date)):
                            if isinstance(field, models.DateTimeField):
                                return value
                            return value.date() if isinstance(value, datetime) else value
                        s = str(value).strip()
                        # Support a few common date formats
                        for fmt in ("%Y-%m-%d", "%d.%m.%Y", "%d/%m/%Y", "%d-%m-%Y"):
                            try:
                                return datetime.strptime(s, fmt).date()
                            except ValueError:
                                continue
                        return None
                    # Fallback: leave the value as-is
                    return value
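                # Examples (illustrative; bare field instances shown for brevity):
                #   _coerce_value_for_field("1 234,56", models.DecimalField())  ->  Decimal("1234.56")
                #   _coerce_value_for_field("tak", models.BooleanField())       ->  True
                #   _coerce_value_for_field("31.12.2024", models.DateField())   ->  date(2024, 12, 31)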

                for key, raw_val in results.items():
                    field = model_fields.get(key)
                    if not field:
                        continue
                    current_val = getattr(ad, key, None)
                    if not _is_empty_field(current_val, field):
                        continue
                    coerced = _coerce_value_for_field(raw_val, field)
                    if coerced is None:
                        continue
                    update_payload[key] = coerced
                    updated_by_description.append(key)

                if update_payload:
                    for k, v in update_payload.items():
                        setattr(ad, k, v)
                    # set provenance column with comma-separated field names; keep null if none
                    if updated_by_description:
                        setattr(ad, "description_scraped_variables", ",".join(updated_by_description))
                        update_fields_final = list(update_payload.keys()) + ["description_scraped_variables"]
                    else:
                        update_fields_final = list(update_payload.keys())
                    ad.save(update_fields=update_fields_final)
            except Exception as _e:
                # Don't break main flow due to enrichment issues
                logger.warning(f"Description enrichment skipped due to error: {_e}")

        for i, page in enumerate(pages, 1):
            mapped = None  # reset per page so error handling never sees a stale mapping
            try:
                print(f"  → [{i}/{min(total_to_process, limit)}] Processing listing: {page.url}")
                ok = parse_manual_data(page, strict=False)
                if not ok:
                    logger.warning(f"Parser returned False for {page.url}")
                    ManualParserLog.objects.create(
                        network_page=page,
                        source=manual_obj.source,
                        source_name=manual_obj.name,
                        domain=urlparse(page.url).netloc,
                        url=page.url,
                        skip_type='error',
                        error_message="parse_manual_data returned False - parsing failed",
                        error_summary="Parsing failed",
                        parsed_successfully=False,
                        saved_to_ads_manual=False
                    )
                    continue

                parsed_data = page.parse_data or safe_json_loads(page.raw_data or "{}")
                if not parsed_data:
                    logger.warning(f"No data in raw_data/parse_data for {page.url}")
                    ManualParserLog.objects.create(
                        network_page=page,
                        source=manual_obj.source,
                        source_name=manual_obj.name,
                        domain=urlparse(page.url).netloc,
                        url=page.url,
                        skip_type='no_data',
                        error_message="No data in raw_data/parse_data after parsing",
                        error_summary="Empty parsed data",
                        parsed_successfully=True,
                        saved_to_ads_manual=False
                    )
                    continue

                mapped = map_data_to_manual_model(parsed_data, page)

                # Final gate: process only records that satisfy the "empty fields" policy
                if only_empty_fields:
                    if not _matches_empty_policy(mapped, only_empty_fields, require_all_empty, empty_include_zero):
                        # skip this record – it does not meet the "empty fields" criterion
                        ManualParserLog.objects.create(
                            network_page=page,
                            source=manual_obj.source,
                            source_name=manual_obj.name,
                            domain=urlparse(page.url).netloc,
                            url=page.url,
                            skip_type='intentional',
                            error_message=f"Skipped by empty fields policy: {only_empty_fields}",
                            error_summary="Filtered by empty fields policy",
                            parsed_successfully=True,
                            saved_to_ads_manual=False
                        )
                        continue

                if dry_run:
                    logger.info(f"[DRY RUN] Ready to save: {page.url} | filtered by empty={only_empty_fields}")
                    continue

                with transaction.atomic():
                    if forced_here and page.network_ad_manual_id:
                        try:
                            page.network_ad_manual.delete()
                        except Exception:
                            pass
                        page.network_ad_manual = None
                        page.save(update_fields=["network_ad_manual"])

                    ad = AdsManual.objects.create(**mapped)
                    page.network_ad_manual = ad
                    page.save(update_fields=["network_ad_manual"])

                # Optional second-stage enrichment from description (fills only empty fields)
                _merge_from_description_if_enabled(ad, manual_obj)

                processed_count += 1
                logger.info(f"Saved listing: {page.url}")

            except Exception as e:
                issues = _guess_offending_fields(AdsManual, mapped or {})
                error_trace = traceback.format_exc()
                
                if issues:
                    details = " | ".join(f"{fld}: {reason} (value='{preview}')" for fld, reason, preview in issues)
                    logger.error(f"Error for {page.url}: {e} | fields -> {details}")
                    
                    ManualParserLog.objects.create(
                        network_page=page,
                        source=manual_obj.source,
                        source_name=manual_obj.name,
                        domain=urlparse(page.url).netloc,
                        url=page.url,
                        skip_type='validation',
                        error_message=f"{str(e)}\n\n{error_trace}",
                        error_summary=f"{str(e)[:500]}",
                        field_errors=[{"field": fld, "reason": reason, "preview": preview} for fld, reason, preview in issues],
                        parsed_successfully=True,
                        saved_to_ads_manual=False
                    )
                else:
                    logger.error(f"Error for {page.url}: {e}")
                    
                    ManualParserLog.objects.create(
                        network_page=page,
                        source=manual_obj.source,
                        source_name=manual_obj.name,
                        domain=urlparse(page.url).netloc,
                        url=page.url,
                        skip_type='error',
                        error_message=f"{str(e)}\n\n{error_trace}",
                        error_summary=f"{str(e)[:500]}",
                        parsed_successfully=mapped is not None,
                        saved_to_ads_manual=False
                    )

        print(f"\n<< END of Manual: [{manual_obj.id}] processed {processed_count}/{min(total_to_process, limit)} listings.")
        processed_total += processed_count

    print(f"\nSUMMARY: processed {processed_total} listings for {len(manuals)} manual(s).\n")
    return processed_total

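# Example invocation (a sketch; typically wired into a management command,
# and "OLX" is a hypothetical manual name):
#   process_manual_queue(
#       limit=500,
#       force_names={"OLX"},            # re-parse this manual even if ads exist
#       only_empty_fields=["price"],    # ...but only pages whose ad lacks a price
#       empty_include_zero=True,
#       dry_run=True,                   # log what would be saved, write nothing
#   )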

def _guess_offending_fields(model_cls, payload: dict):
    """
    Zwraca listę (field, reason, preview) dla oczywistych problemów:
    - Char/URL: długość > max_length
    - Boolean: wartość inna niż True/False/None
    - Integer/Float: nie da się zrzutować
    """
    issues = []
    for f in model_cls._meta.get_fields():
        # skip reverse relations, etc.
        name = getattr(f, "attname", None)
        if not name or name not in payload:
            continue
        val = payload[name]

        try:
            # Char/URL
            if isinstance(f, (models.CharField, models.URLField)):
                max_len = getattr(f, "max_length", None)
                if val is not None and max_len:
                    s = str(val)
                    if len(s) > max_len:
                        issues.append((name, f"too long ({len(s)}/{max_len})", s[:120] + ("…" if len(s) > 120 else "")))

            # Boolean
            elif isinstance(f, models.BooleanField):
                if val not in (True, False, None):
                    issues.append((name, f"invalid boolean '{val}'", str(val)))

            # Integer
            elif isinstance(f, models.IntegerField) and not isinstance(f, models.BooleanField):
                if val not in (None, ""):
                    try:
                        int(val)
                    except Exception:
                        issues.append((name, f"invalid integer '{val}'", str(val)))

            # Float/Decimal
            elif isinstance(f, (models.FloatField, models.DecimalField)):
                if val not in (None, ""):
                    try:
                        float(val)
                    except Exception:
                        issues.append((name, f"invalid float/decimal '{val}'", str(val)))

        except Exception:
            # don't block error logging if anything goes wrong here
            pass

    return issues

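# Example (illustrative; assumes AdsManual.url is a URLField(max_length=200)
# and price is a DecimalField):
#   _guess_offending_fields(AdsManual, {"url": "x" * 500, "price": "abc"})
#     -> [("url", "too long (500/200)", "xxx..."), ("price", "invalid float/decimal 'abc'", "abc")]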