#!/usr/bin/env python3
"""
parallel_manual_parser_v2.py

Run many instances of:
  python manage.py manual_parser --id <PAGE_ID>
concurrently, so large backlogs finish much faster without overlapping work.

Why per-ID? Passing a single page ID to each process guarantees that no two
workers ever race on the same listing. It also works with the existing
management command and database constraints as-is, requiring zero code changes.

Highlights
- Supports one or multiple manuals via --name (comma separated)
- Mirrors selection logic from your runner (forced vs non-forced queues)
- Allows scoping by --max, --since-id, --until-id, and page name filter
- Spawns N concurrent subprocesses; summarizes successes/failures
- Windows-friendly (PowerShell), uses current Python interpreter by default

Examples (PowerShell from repository root):
  # 8 workers, first 1000 pages from Dobry
  python scripts/parallel_manual_parser_v2.py --name dobry --workers 8 --max 1000

  # 6 workers, Dobry + Otodom, process up to 3000 pages total
  python scripts/parallel_manual_parser_v2.py --name dobry,otodom --workers 6 --max 3000

  # Dry run (no DB writes inside manage.py), 4 workers
  python scripts/parallel_manual_parser_v2.py --name dobry --workers 4 --dry-run

  # Restrict to a page id window
  python scripts/parallel_manual_parser_v2.py --name dobry --since-id 200000 --until-id 250000 --workers 8
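
  # Group several ids per worker task to cut process-spawn overhead
  python scripts/parallel_manual_parser_v2.py --name dobry --workers 8 --batch-size 5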

Notes
- This script does NOT modify any existing code. It only orchestrates via subprocesses.
- It initializes Django solely to discover the list of page IDs to schedule.
- Each subprocess is: <python> manage.py manual_parser --id <ID> [--dry-run] [--force] [--force-name ...]
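
Tip: to estimate the pending (non-forced) queue for one source before a big
run, a minimal sketch from `python manage.py shell`, using the same models
this script imports (it ignores the empty-HTML exclusions applied below, so
treat the count as an upper bound):

  from extractly.models import SourceManual, NetworkMonitoredPage
  m = SourceManual.objects.filter(name__iexact="dobry").first()
  print(NetworkMonitoredPage.objects.filter(
      network_ad_manual__isnull=True, source=m.source).count())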
"""

from __future__ import annotations

import argparse
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from subprocess import Popen, PIPE
from typing import Iterable, List, Optional, Sequence, Set, Tuple

# --- Resolve repo root and manage.py path ---
THIS_DIR = os.path.dirname(os.path.abspath(__file__))
REPO_ROOT = os.path.abspath(os.path.join(THIS_DIR, os.pardir))
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)
MANAGE_PY = os.path.join(REPO_ROOT, "manage.py")

# --- Django setup (read-only ORM usage) ---
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "NetworkMonitoring.settings")
try:
    import django  # type: ignore
    django.setup()
except Exception as e:
    # Give a clearer hint when the settings module differs in your project.
    sys.stderr.write(
        "[WARN] Could not initialize Django with settings 'NetworkMonitoring.settings'.\n"
        "If your project uses a different settings module, set it first, e.g.:\n"
        "       PowerShell:  $env:DJANGO_SETTINGS_MODULE = 'yourproj.settings'\n"
        "       cmd.exe:     set DJANGO_SETTINGS_MODULE=yourproj.settings\n"
        f"       (Current error: {e})\n\n"
    )
    raise

from django.db.models import Q
from extractly.models import SourceManual, NetworkMonitoredPage


def chunked(seq: List[int], n: int) -> Iterable[List[int]]:
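    """Yield successive n-sized slices of seq; n below 1 is treated as 1."""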
    if n <= 0:
        n = 1
    for i in range(0, len(seq), n):
        yield seq[i : i + n]


def find_manuals(names: Set[str]) -> List[SourceManual]:
    """Match by manual name or its linked source title/name (case-insensitive)."""
    if not names:
        return list(SourceManual.objects.filter(enable=True))
    lowered = {n.strip().lower() for n in names if n and n.strip()}
    qs = SourceManual.objects.filter(enable=True)
    out: List[SourceManual] = []
    for m in qs:
        cand = {
            (m.name or "").lower(),
            (m.title or "").lower(),
            (getattr(m.source, "title", "") or "").lower(),
            (getattr(m.source, "name", "") or "").lower(),
        }
        if cand & lowered:
            out.append(m)
    return out


def select_page_ids(
    manual: SourceManual,
    max_count: Optional[int],
    since_id: Optional[int],
    until_id: Optional[int],
    forced: bool,
    name_filter: Optional[str],
) -> List[int]:
    """Replicate selection logic from process_manual_queue for a single manual."""
    # Shared "unusable HTML" exclusion: pages whose raw or sliced HTML is
    # missing or effectively empty cannot be parsed and are skipped.
    empty_html_q = (
        Q(sliced_html__isnull=True)
        | Q(sliced_html__exact="")
        | Q(sliced_html__exact="{}")
        | Q(sliced_html__exact="[]")
        | Q(sliced_html__exact=" ")
        | Q(html__isnull=True)
        | Q(html__exact="")
        | Q(html__exact="{}")
        | Q(html__exact="[]")
        | Q(html__exact=" ")
    )
    if forced:
        # Forced mode reparses every page with usable HTML, even ones already
        # linked to a manual ad; it additionally skips recorded fetch errors.
        qs = (
            NetworkMonitoredPage.objects.filter(source=manual.source)
            .exclude(empty_html_q | Q(html__exact="error"))
            .order_by("id")
        )
    else:
        # Non-forced mode only picks pages not yet linked to a manual ad.
        qs = (
            NetworkMonitoredPage.objects.filter(
                network_ad_manual__isnull=True, source=manual.source
            )
            .exclude(empty_html_q)
            .order_by("id")
        )

    if since_id:
        qs = qs.filter(id__gte=since_id)
    if until_id:
        qs = qs.filter(id__lte=until_id)
    if name_filter:
        qs = qs.filter(name__icontains=name_filter)

    id_qs = qs.values_list("id", flat=True)
    if max_count:
        id_qs = id_qs[:max_count]
    return list(id_qs)


def run_manage_for_id(
    page_id: int,
    manage_py: str,
    py_exe: str,
    dry_run: bool,
    force: bool,
    force_names: Optional[Set[str]],
    cwd: Optional[str],
) -> Tuple[int, int]:
    """Run one subprocess for a given page id; returns (id, exit_code)."""
    cmd = [py_exe, manage_py, "manual_parser", "--id", str(page_id)]
    if dry_run:
        cmd.append("--dry-run")
    if force:
        cmd.append("--force")
    if force_names:
        cmd.extend(["--force-name", ",".join(sorted(force_names))])

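    # Force unbuffered output in the child process so its logs surface
    # promptly even when many workers interleave on the console.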
    env = os.environ.copy()
    env.setdefault("PYTHONUNBUFFERED", "1")

    proc = Popen(cmd, stdout=PIPE, stderr=PIPE, cwd=cwd, env=env, text=True)
    out, err = proc.communicate()
    if proc.returncode == 0:
        print(f"[OK ] id={page_id}")
    else:
        print(f"[ERR] id={page_id} rc={proc.returncode}")
        # Guard the prints: some Windows consoles raise UnicodeEncodeError for
        # characters outside the active code page.
        for stream in (err, out):
            if stream:
                try:
                    print(stream.strip())
                except Exception:
                    pass
    return page_id, proc.returncode


def main(argv: Sequence[str] | None = None) -> int:
    ap = argparse.ArgumentParser(description="Parallel orchestrator for manual_parser (per-ID safe mode)")
    ap.add_argument("--name", type=str, required=True,
                    help="Manual name(s) or source names, comma-separated (e.g., 'dobry' or 'dobry,otodom')")
    ap.add_argument("--workers", type=int, default=min(8, os.cpu_count() or 4),
                    help="Concurrent subprocesses (default = min(8, CPU cores))")
    ap.add_argument("--max", type=int, default=0,
                    help="Max total pages to process (0 = all)")
    ap.add_argument("--batch-size", type=int, default=1,
                    help="IDs per worker task (1 recommended; >1 runs them sequentially within the same worker)")
    ap.add_argument("--since-id", type=int, default=0, help="Lower bound page id (inclusive)")
    ap.add_argument("--until-id", type=int, default=0, help="Upper bound page id (inclusive)")
    ap.add_argument("--filter-name", type=str, default="",
                    help="Filter NetworkMonitoredPage.name via icontains (optional)")
    ap.add_argument("--dry-run", action="store_true", help="Pass --dry-run to manage.py")
    ap.add_argument("--force", action="store_true", help="Pass --force to manage.py")
    ap.add_argument("--force-name", type=str, default="",
                    help="Pass --force-name to manage.py (comma-separated)")
    ap.add_argument("--python", type=str, default=sys.executable,
                    help="Python executable to invoke manage.py (defaults to current interpreter)")
    ap.add_argument("--manage", type=str, default=MANAGE_PY, help="Path to manage.py")
    ap.add_argument("--cwd", type=str, default=REPO_ROOT, help="Working directory for subprocess calls")

    args = ap.parse_args(argv)

    wanted_names: Set[str] = {s.strip() for s in (args.name or "").split(",") if s and s.strip()}
    if not wanted_names:
        print("No --name provided.")
        return 2

    manuals = find_manuals(wanted_names)
    if not manuals:
        print("No matching manuals found for:", ", ".join(sorted(wanted_names)))
        return 3

    # Gather IDs across all manuals
    all_ids: List[int] = []
    for m in manuals:
        ids = select_page_ids(
            manual=m,
            max_count=None,
            since_id=(args.since_id or None),
            until_id=(args.until_id or None),
            forced=args.force,
            name_filter=(args.filter_name or None),
        )
        all_ids.extend(ids)

    # Cap total by --max if set
    if args.max and args.max > 0:
        all_ids = all_ids[: args.max]

    # De-duplicate while preserving order
    seen = set()
    uniq_ids: List[int] = []
    for pid in all_ids:
        if pid not in seen:
            seen.add(pid)
            uniq_ids.append(pid)

    if not uniq_ids:
        print("No pages to process.")
        return 0

    # Prepare jobs (optionally group in batches)
    batches: List[List[int]] = list(chunked(uniq_ids, max(1, int(args.batch_size))))
    print(f"Planning to process {len(uniq_ids)} page(s) across {len(batches)} batch(es) with {args.workers} worker(s).")

    force_names: Set[str] = {s.strip().lower() for s in (args.force_name or "").split(",") if s and s.strip()}

    start = time.time()
    ok = 0
    total = 0

    def run_batch(batch: List[int]) -> Tuple[int, int]:
        """Run each id in the batch sequentially; return (ok_count, len(batch))."""
        batch_ok = 0
        for pid in batch:
            _pid, rc = run_manage_for_id(
                pid, args.manage, args.python, args.dry_run, args.force, force_names or None, args.cwd
            )
            if rc == 0:
                batch_ok += 1
        return batch_ok, len(batch)

    # A thread pool is sufficient here: each task just blocks on a subprocess,
    # so the GIL never becomes a bottleneck. Count results per page, not per
    # batch, so the final summary reflects every id processed.
    with ThreadPoolExecutor(max_workers=max(1, int(args.workers))) as ex:
        futs = [ex.submit(run_batch, b) for b in batches]
        for fut in as_completed(futs):
            batch_ok, batch_total = fut.result()
            ok += batch_ok
            total += batch_total

    elapsed = time.time() - start
    print(f"Done. OK: {ok}, FAIL: {total - ok}, elapsed: {elapsed:.1f}s")
    return 0 if ok == total else 1


if __name__ == "__main__":
    raise SystemExit(main())
