#!/usr/bin/env python3
"""
Parallel runner for manual_parser.

This script spawns multiple concurrent processes of:
  python manage.py manual_parser --id <PAGE_ID> [--dry-run] [--force]

Why per-ID? Handing each worker one explicit --id guarantees no overlap between
workers and reuses the existing command unmodified. Running several workers
with --name instead risks races, because every process would fetch from the
same pool of unprocessed pages and could parse the same page twice.

Usage examples (from project root):
  # Process up to 200 Dobry listings with 6 workers
  python scripts/parallel_manual_parser.py --names dobry --max-pages 200 --workers 6

  # Process two sources together with 8 workers
  python scripts/parallel_manual_parser.py --names dobry,otodom --workers 8 --max-pages 500

  # Dry run (no DB writes), 4 workers
  python scripts/parallel_manual_parser.py --names dobry --workers 4 --dry-run
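
  # Keep looping until the queue is empty, refreshing the ID list each pass
  python scripts/parallel_manual_parser.py --names dobry --workers 4 --refresh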

Options:
  --names         Comma-separated manual names (SourceManual.name), e.g. "dobry,otodom"
  --workers       Max concurrent processes (default: 4)
  --max-pages     Upper bound of pages to process in total across all names
  --per-name      Per-name cap; if set, each name will contribute at most this many pages
  --force         Pass --force to underlying manual_parser (be cautious)
  --dry-run       Pass --dry-run to underlying manual_parser
  --refresh       Recompute the ID list and continue until queue is empty (loop)
  --python        Explicit python executable to use; defaults to current interpreter
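
Exit status:
  0 if every page succeeded, 1 if any page failed, 2 if no valid names were given.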

This script requires Django settings to be importable (NetworkMonitoring.settings).
"""
from __future__ import annotations

import argparse
import os
import sys
import time
from typing import List, Sequence, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
import subprocess

# --- Django setup ---
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "NetworkMonitoring.settings")
import django  # type: ignore

django.setup()
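# The ORM imports below require django.setup() to have run first.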

from django.db.models import Q
from extractly.models import SourceManual, NetworkMonitoredPage


def _page_queryset_for_source(source_id: int):
    # Match the selection logic in run_parser for non-force mode: only pages
    # not yet parsed (no network_ad_manual) whose html/sliced_html look usable.
    qs = (
        NetworkMonitoredPage.objects.filter(
            source_id=source_id,
            network_ad_manual__isnull=True,
        )
        .exclude(
            Q(sliced_html__isnull=True)
            | Q(sliced_html__exact="")
            | Q(sliced_html__exact="{}")
            | Q(sliced_html__exact="[]")
            | Q(sliced_html__exact=" ")
            | Q(html__isnull=True)
            | Q(html__exact="")
            | Q(html__exact="error")
            | Q(html__exact="{}")
            | Q(html__exact="[]")
            | Q(html__exact=" ")
        )
        .order_by("id")
    )
    return qs


def pick_ids(names: Sequence[str], per_name: int | None, total_limit: int | None) -> List[int]:
    ids: List[int] = []
    seen: set[int] = set()  # de-duplicate while preserving order
    for name in names:
        manuals = list(SourceManual.objects.filter(name__iexact=name, enable=True))
        if not manuals:
            # lenient fallback: try contains on manual name or source title/name
            manuals = list(
                SourceManual.objects.filter(enable=True).filter(
                    Q(name__icontains=name)
                    | Q(source__title__icontains=name)
                    | Q(source__name__icontains=name)
                )
            )
        if not manuals:
            print(f"[WARN] No SourceManual found for name '{name}'. Skipping.")
            continue

        taken_for_name = 0
        for m in manuals:
            # Budget for this manual: the per-name cap and the global cap,
            # whichever is tighter (the per-name cap spans all of the name's manuals).
            budget: int | None = None
            if per_name is not None:
                budget = per_name - taken_for_name
            if total_limit is not None:
                left = total_limit - len(ids)
                budget = left if budget is None else min(budget, left)
            if budget is not None and budget <= 0:
                break

            q = _page_queryset_for_source(m.source_id).values_list("id", flat=True)
            for i in (q[:budget] if budget is not None else q):
                if i not in seen:
                    seen.add(i)
                    ids.append(i)
                    taken_for_name += 1

            if total_limit is not None and len(ids) >= total_limit:
                return ids

    return ids


def run_manual_parser_for_id(page_id: int, py_exe: str, dry_run: bool, force: bool) -> Tuple[int, int]:
    """Returns (page_id, return_code)."""
    cmd = [py_exe, "manage.py", "manual_parser", "--id", str(page_id)]
    if dry_run:
        cmd.append("--dry-run")
    if force:
        cmd.append("--force")

    # Inherit the parent environment; force unbuffered output in the child so
    # its logs are flushed promptly instead of sitting in stdio buffers.
    env = os.environ.copy()
    env.setdefault("PYTHONUNBUFFERED", "1")

    try:
        res = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        if res.returncode != 0:
            sys.stderr.write(f"[ERR] id={page_id} rc={res.returncode}: {res.stderr.strip()}\n")
        else:
            sys.stdout.write(f"[OK ] id={page_id}\n")
        return (page_id, res.returncode)
    except Exception as e:
        sys.stderr.write(f"[EXC] id={page_id}: {e}\n")
        return (page_id, 1)


def process_batch(ids: Sequence[int], workers: int, py_exe: str, dry_run: bool, force: bool) -> Tuple[int, int]:
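    """Run manual_parser for each ID via a thread pool; returns (ok, total).

    Threads (rather than processes) are sufficient here: each worker merely
    blocks on subprocess.run, so the real parallelism comes from the child
    processes, not the threads.
    """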
    ok = 0
    total = 0
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futs = [pool.submit(run_manual_parser_for_id, pid, py_exe, dry_run, force) for pid in ids]
        for fut in as_completed(futs):
            _pid, rc = fut.result()
            total += 1
            if rc == 0:
                ok += 1
    return ok, total


def main(argv: Sequence[str] | None = None) -> int:
    p = argparse.ArgumentParser(description="Parallel runner for manual_parser (per-ID safe mode)")
    p.add_argument("--names", required=True, help="Comma-separated manual names (e.g. 'dobry,otodom')")
    p.add_argument("--workers", type=int, default=4, help="Max concurrent processes")
    p.add_argument("--max-pages", type=int, default=None, help="Total max pages to process across all names")
    p.add_argument("--per-name", type=int, default=None, help="Per-name cap of pages to schedule")
    p.add_argument("--force", action="store_true", help="Pass --force to manual_parser (be careful)")
    p.add_argument("--dry-run", action="store_true", help="Pass --dry-run to manual_parser")
    p.add_argument("--refresh", action="store_true", help="Loop: refresh ID list and run until empty queue")
    p.add_argument("--python", default=sys.executable, help="Python executable to invoke manage.py")

    args = p.parse_args(argv)

    names = [n.strip() for n in args.names.split(",") if n and n.strip()]
    if not names:
        print("No valid names provided.")
        return 2

    total_ok = 0
    total_done = 0
    # Track IDs we already scheduled so --refresh never re-runs them. Without
    # this, pages that keep failing (or every page under --dry-run, which
    # writes nothing) would be re-picked forever.
    attempted: set[int] = set()

    while True:
        ids = [i for i in pick_ids(names, args.per_name, args.max_pages) if i not in attempted]

        if not ids:
            print("No pages to process.")
            break

        print(f"Scheduling {len(ids)} page(s) with {args.workers} worker(s)...")
        ok, done = process_batch(ids, args.workers, args.python, args.dry_run, args.force)
        attempted.update(ids)
        total_ok += ok
        total_done += done
        print(f"Batch complete: {ok}/{done} succeeded.")

        if not args.refresh:
            break
        # In refresh mode, pause briefly, then re-query for whatever is left in the queue.
        time.sleep(1.0)

    print(f"\nAll done: {total_ok}/{total_done} successful.")
    # return non-zero if any failed to make it visible to CI
    return 0 if total_ok == total_done else 1


if __name__ == "__main__":
    raise SystemExit(main())
