#!/usr/bin/env python3
"""
parallel_manual_parser_v3.py

Enhanced parallel runner with improved error handling, monitoring, and performance.

Key Improvements:
- Retry mechanism for failed processes
- Progress tracking with rich progress bars
- Better resource management and monitoring
- Graceful interrupt handling
- Detailed logging and statistics
- Resume capability for interrupted runs
- Memory-efficient batch processing
- Configuration file support

Examples:
  python scripts/parallel_manual_parser_v3.py --name dobry --workers 8 --max 1000
  python scripts/parallel_manual_parser_v3.py --config config/parser.json
  python scripts/parallel_manual_parser_v3.py --name dobry --resume --state-file .parser_state.json
"""

from __future__ import annotations

import argparse
import json
import logging
import os
import signal
import sys
import time
import traceback
from concurrent.futures import ProcessPoolExecutor, as_completed, Future
from dataclasses import dataclass, asdict
from pathlib import Path
from subprocess import Popen, PIPE, TimeoutExpired
from typing import Dict, Iterable, List, Optional, Set, Tuple, Any
from collections import defaultdict, deque
import threading
import psutil
from datetime import datetime, timedelta

# Progress bar support (optional)
try:
    from tqdm import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
    class tqdm:
        def __init__(self, *args, **kwargs):
            self.total = kwargs.get('total', 0)
            self.current = 0
            
        def update(self, n=1):
            self.current += n
            if self.total > 0:
                pct = (self.current / self.total) * 100
                print(f"\rProgress: {self.current}/{self.total} ({pct:.1f}%)", end='', flush=True)
        
        def close(self):
            print()
            
        def __enter__(self):
            return self
            
        def __exit__(self, *args):
            self.close()

# --- Setup paths and Django ---
THIS_DIR = Path(__file__).parent.absolute()
REPO_ROOT = THIS_DIR.parent
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

MANAGE_PY = REPO_ROOT / "manage.py"

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('parallel_parser.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Django setup with better error handling
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "NetworkMonitoring.settings")
try:
    import django
    django.setup()
    from django.db.models import Q
    from extractly.models import SourceManual, NetworkMonitoredPage
    logger.info("Django initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize Django: {e}")
    sys.stderr.write(
        f"[ERROR] Could not initialize Django: {e}\n"
        "Make sure DJANGO_SETTINGS_MODULE is correct and dependencies are installed.\n"
    )
    raise


@dataclass
class TaskResult:
    """Result of processing a single page ID."""
    page_id: int
    manual_name: str
    exit_code: int
    duration: float
    stdout: str = ""
    stderr: str = ""
    retries: int = 0
    timestamp: float = 0.0


@dataclass
class ProcessingStats:
    """Statistics for the entire processing run."""
    total_pages: int = 0
    completed: int = 0
    failed: int = 0
    skipped: int = 0
    retried: int = 0
    start_time: float = 0.0
    end_time: float = 0.0
    avg_duration: float = 0.0
    peak_memory_mb: float = 0.0
    
    def success_rate(self) -> float:
        return (self.completed / max(1, self.total_pages)) * 100
    
    def total_duration(self) -> float:
        return self.end_time - self.start_time if self.end_time > self.start_time else 0.0


@dataclass
class ParserConfig:
    """Configuration for the parallel parser."""
    manuals: List[str]
    workers: int = 8
    max_pages: int = 0
    per_domain_max: int = 0  # Uniform per-manual cap when multiple manuals
    per_domain_max_map: Dict[str, int] | None = None  # Manual-specific caps
    batch_size: int = 1
    since_id: Optional[int] = None
    until_id: Optional[int] = None
    filter_name: str = ""
    dry_run: bool = False
    force: bool = False
    force_names: List[str] = None
    use_description_scraper: bool = False
    python_exe: str = sys.executable
    manage_py: str = str(MANAGE_PY)
    cwd: str = str(REPO_ROOT)
    timeout: int = 300  # 5 minutes per page
    max_retries: int = 2
    retry_delay: float = 1.0
    state_file: str = ".parser_state.json"
    log_level: str = "INFO"
    memory_limit_mb: int = 4096
    auto_workers: bool = False  # Enable automatic worker optimization
    adaptive_workers: bool = False  # Enable runtime worker adjustment
    min_workers: int = 2  # Minimum workers for auto mode
    max_workers_limit: int = 0  # 0 = no limit, let auto-detect
    
    def __post_init__(self):
        if self.force_names is None:
            self.force_names = []
        if self.per_domain_max_map is None:
            self.per_domain_max_map = {}


class WorkerOptimizer:
    """Automatically determine and optimize worker count."""
    
    def __init__(self, config: ParserConfig):
        self.config = config
        self.system_info = self._get_system_info()
        self.performance_history = deque(maxlen=50)  # Track recent performance
        self.last_adjustment = 0
        self.adjustment_cooldown = 30  # seconds between adjustments
        
    def _get_system_info(self) -> Dict[str, Any]:
        """Get comprehensive system information."""
        try:
            # Test database connection limits
            db_connections = self._test_database_connections()
        except Exception as e:
            logger.warning(f"Could not determine database limits: {e}")
            db_connections = 100  # Conservative default
            
        return {
            'cpu_cores': os.cpu_count() or 4,
            'memory_gb': psutil.virtual_memory().total / (1024**3),
            'available_memory_gb': psutil.virtual_memory().available / (1024**3),
            'os_limit': 61 if os.name == 'nt' else 200,
            'db_connections': db_connections,
            'is_windows': os.name == 'nt'
        }
    
    def _test_database_connections(self) -> int:
        """Test available database connections."""
        try:
            from django.db import connection
            with connection.cursor() as cursor:
                # Try to get max connections
                try:
                    cursor.execute("SHOW max_connections;")  # PostgreSQL
                    result = cursor.fetchone()
                    return int(result[0]) - 20  # Reserve 20 connections
                except:
                    try:
                        cursor.execute("SHOW VARIABLES LIKE 'max_connections';")  # MySQL
                        result = cursor.fetchone()
                        return int(result[1]) - 20
                    except:
                        return 100  # Default conservative limit
        except Exception:
            return 100
    
    def _test_process_limit(self) -> int:
        """Test actual process creation limit."""
        logger.info("Testing process creation limits...")
        
        def quick_task(x):
            time.sleep(0.1)
            return x
        
        max_workers = self.config.min_workers
        
        for test_workers in [4, 8, 16, 24, 32, 48, 61, 80, 100]:
            if test_workers < self.config.min_workers:
                continue
                
            try:
                logger.debug(f"Testing {test_workers} workers...")
                with ProcessPoolExecutor(max_workers=test_workers) as executor:
                    futures = [executor.submit(quick_task, i) for i in range(min(test_workers, 10))]
                    [f.result(timeout=5) for f in futures]
                
                max_workers = test_workers
                
                # Stop at OS limit for Windows
                if self.system_info['is_windows'] and test_workers >= 61:
                    break
                    
            except Exception as e:
                logger.debug(f"Failed at {test_workers} workers: {e}")
                break
        
        logger.info(f"Process limit test: {max_workers} workers")
        return max_workers
    
    def _estimate_memory_per_worker(self) -> float:
        """Estimate memory usage per worker."""
        logger.info("Estimating memory usage per worker...")
        
        def memory_test_task(x):
            # Simulate Django loading and basic processing
            import time
            data = "test" * 1024 * 1024  # 4MB of test data
            time.sleep(0.5)
            return len(data)
        
        initial_memory = psutil.virtual_memory().used
        
        try:
            test_workers = min(4, self.system_info['cpu_cores'])
            with ProcessPoolExecutor(max_workers=test_workers) as executor:
                futures = [executor.submit(memory_test_task, i) for i in range(test_workers)]
                [f.result(timeout=10) for f in futures]
            
            time.sleep(1)  # Allow memory to stabilize
            peak_memory = psutil.virtual_memory().used
            memory_per_worker = (peak_memory - initial_memory) / (1024 * 1024 * test_workers)
            
            # Add overhead estimates
            base_memory = 50  # Base Python + Django
            processing_memory = max(50, memory_per_worker)  # Actual processing
            total_per_worker = base_memory + processing_memory
            
            logger.info(f"Estimated memory per worker: {total_per_worker:.1f} MB")
            return total_per_worker
            
        except Exception as e:
            logger.warning(f"Memory estimation failed: {e}")
            return 150  # Conservative default
    
    def determine_optimal_workers(self) -> int:
        """Determine the optimal number of workers automatically."""
        logger.info("🔍 Auto-detecting optimal worker count...")
        
        # Get system limits
        cpu_limit = self.system_info['cpu_cores']
        memory_per_worker = self._estimate_memory_per_worker()
        process_limit = self._test_process_limit()
        
        # Calculate memory-based limit (use 75% of available memory)
        available_memory_mb = self.system_info['available_memory_gb'] * 1024 * 0.75
        memory_limit = int(available_memory_mb / memory_per_worker)
        
        # Database connection limit
        db_limit = self.system_info['db_connections']
        
        # Apply user-defined limits
        max_limit = self.config.max_workers_limit or 999
        
        # Calculate optimal
        limits = {
            'cpu_cores': cpu_limit,
            'hyperthreading': cpu_limit * 2,  # For I/O bound operations
            'memory': memory_limit,
            'process_limit': process_limit,
            'database': db_limit,
            'user_limit': max_limit
        }
        
        # Choose the most restrictive limit
        optimal = min(limits.values())
        optimal = max(optimal, self.config.min_workers)  # Ensure minimum
        
        # Log the analysis
        logger.info("🔍 Worker Limit Analysis:")
        for limit_type, limit_value in limits.items():
            status = "🚫" if limit_value == optimal else "✅"
            logger.info(f"  {status} {limit_type}: {limit_value}")
        
        logger.info(f"🎯 Optimal workers determined: {optimal}")
        
        # Performance validation
        if not self.config.dry_run:
            validated_workers = self._validate_performance(optimal)
            if validated_workers != optimal:
                logger.info(f"🎯 Performance-validated workers: {validated_workers}")
                return validated_workers
        
        return optimal
    
    def _validate_performance(self, initial_workers: int) -> int:
        """Validate optimal workers with a small performance test."""
        logger.info(f"🧪 Validating performance with {initial_workers} workers...")
        
        # Test with a small subset of actual work
        test_results = {}
        
        for test_workers in [
            max(2, initial_workers // 2),  # Half
            initial_workers,                # Calculated optimal
            min(initial_workers + 4, self.system_info['os_limit'])  # Slightly more
        ]:
            if test_workers in test_results:
                continue
                
            try:
                start_time = time.time()
                
                # Run a quick test with actual manage.py calls
                test_ids = self._get_test_page_ids(10)  # Small test sample
                if not test_ids:
                    logger.warning("No test pages available, skipping performance validation")
                    return initial_workers
                
                batches = list(chunked(test_ids, 1))
                batch_configs = [(batch, self.config) for batch in batches[:test_workers]]
                
                with ProcessPoolExecutor(max_workers=test_workers) as executor:
                    futures = [executor.submit(process_batch_worker, bc) for bc in batch_configs]
                    results = [f.result(timeout=60) for f in futures]  # 1 minute timeout
                
                duration = time.time() - start_time
                success_count = sum(1 for batch in results for result in batch if result.exit_code == 0)
                
                test_results[test_workers] = {
                    'duration': duration,
                    'success_rate': success_count / len(test_ids) if test_ids else 0,
                    'throughput': len(test_ids) / duration if duration > 0 else 0
                }
                
                logger.debug(f"Test {test_workers} workers: {duration:.1f}s, "
                           f"throughput: {test_results[test_workers]['throughput']:.2f}/s")
                
            except Exception as e:
                logger.warning(f"Performance test failed for {test_workers} workers: {e}")
        
        # Choose the configuration with best throughput
        if test_results:
            best_workers = max(test_results.keys(), 
                             key=lambda w: test_results[w]['throughput'])
            logger.info(f"🏆 Best performing configuration: {best_workers} workers "
                       f"({test_results[best_workers]['throughput']:.2f} pages/sec)")
            return best_workers
        
        return initial_workers
    
    def _get_test_page_ids(self, count: int) -> List[int]:
        """Get a small sample of page IDs for testing."""
        try:
            # Use the same logic as main script but limit to small sample
            manual_set = set(self.config.manuals) if self.config.manuals else set()
            manuals = find_manuals(manual_set)
            
            if not manuals:
                return []
            
            # Just use first manual for testing
            manual = manuals[0]
            base_qs = build_base_queryset(
                manual, 
                self.config.force, 
                self.config.since_id, 
                self.config.until_id, 
                self.config.filter_name
            )
            
            return list(base_qs.values_list("id", flat=True)[:count])
            
        except Exception as e:
            logger.warning(f"Could not get test page IDs: {e}")
            return []
    
    def should_adjust_workers(self, current_performance: Dict) -> bool:
        """Determine if worker count should be adjusted."""
        if not self.config.adaptive_workers:
            return False
            
        # Don't adjust too frequently
        if time.time() - self.last_adjustment < self.adjustment_cooldown:
            return False
        
        # Store performance metrics
        self.performance_history.append({
            'timestamp': time.time(),
            'workers': current_performance.get('workers', 0),
            'throughput': current_performance.get('throughput', 0),
            'memory_usage': psutil.virtual_memory().percent,
            'cpu_usage': psutil.cpu_percent()
        })
        
        # Need at least 5 data points for trend analysis
        if len(self.performance_history) < 5:
            return False
        
        # Analyze trends
        recent = list(self.performance_history)[-5:]
        avg_throughput = sum(p['throughput'] for p in recent) / len(recent)
        avg_memory = sum(p['memory_usage'] for p in recent) / len(recent)
        avg_cpu = sum(p['cpu_usage'] for p in recent) / len(recent)
        
        # Conditions for adjustment
        should_increase = (
            avg_cpu < 70 and  # CPU not saturated
            avg_memory < 80 and  # Memory available
            avg_throughput > 0  # Making progress
        )
        
        should_decrease = (
            avg_memory > 90 or  # High memory usage
            avg_cpu > 95 or     # CPU saturated
            avg_throughput < 0.1  # Very low throughput
        )
        
        return should_increase or should_decrease
    
    def adjust_workers(self, current_workers: int, current_performance: Dict) -> int:
        """Adjust worker count based on current performance."""
        if not self.should_adjust_workers(current_performance):
            return current_workers
        
        recent = list(self.performance_history)[-5:]
        avg_memory = sum(p['memory_usage'] for p in recent) / len(recent)
        avg_cpu = sum(p['cpu_usage'] for p in recent) / len(recent)
        
        new_workers = current_workers
        
        if avg_memory > 90 or avg_cpu > 95:
            # Reduce workers
            new_workers = max(self.config.min_workers, current_workers - 2)
            logger.info(f"📉 Reducing workers: {current_workers} → {new_workers} "
                       f"(Memory: {avg_memory:.1f}%, CPU: {avg_cpu:.1f}%)")
        
        elif avg_cpu < 70 and avg_memory < 80:
            # Increase workers
            max_increase = min(
                self.system_info['os_limit'],
                self.config.max_workers_limit or 999
            )
            new_workers = min(max_increase, current_workers + 2)
            if new_workers > current_workers:
                logger.info(f"📈 Increasing workers: {current_workers} → {new_workers}")
        
        if new_workers != current_workers:
            self.last_adjustment = time.time()
        
        return new_workers


class ProcessMonitor:
    """Monitor system resources during processing."""
    
    def __init__(self):
        self.peak_memory = 0.0
        self.current_memory = 0.0
        self.cpu_percent = 0.0
        self.running = False
        self.thread = None
        
    def start(self):
        self.running = True
        self.thread = threading.Thread(target=self._monitor)
        self.thread.daemon = True
        self.thread.start()
        
    def stop(self):
        self.running = False
        if self.thread:
            self.thread.join(timeout=1.0)
            
    def _monitor(self):
        process = psutil.Process()
        while self.running:
            try:
                mem_info = process.memory_info()
                self.current_memory = mem_info.rss / 1024 / 1024  # MB
                self.peak_memory = max(self.peak_memory, self.current_memory)
                self.cpu_percent = process.cpu_percent()
                time.sleep(1.0)
            except Exception:
                pass


class StateManager:
    """Manage parser state for resume capability."""
    
    def __init__(self, state_file: str):
        self.state_file = Path(state_file)
        self.state = {
            'completed_ids': set(),
            'failed_ids': set(),
            'stats': {},
            'config': {},
            'last_update': time.time()
        }
        self.lock = threading.Lock()
        
    def load(self) -> bool:
        """Load state from file. Returns True if loaded successfully."""
        try:
            if self.state_file.exists():
                with open(self.state_file, 'r') as f:
                    data = json.load(f)
                    self.state['completed_ids'] = set(data.get('completed_ids', []))
                    self.state['failed_ids'] = set(data.get('failed_ids', []))
                    self.state['stats'] = data.get('stats', {})
                    self.state['config'] = data.get('config', {})
                    self.state['last_update'] = data.get('last_update', time.time())
                logger.info(f"Loaded state: {len(self.state['completed_ids'])} completed, "
                           f"{len(self.state['failed_ids'])} failed")
                return True
        except Exception as e:
            logger.warning(f"Could not load state file: {e}")
        return False
        
    def save(self):
        """Save current state to file."""
        with self.lock:
            try:
                data = {
                    'completed_ids': list(self.state['completed_ids']),
                    'failed_ids': list(self.state['failed_ids']),
                    'stats': self.state['stats'],
                    'config': self.state['config'],
                    'last_update': time.time()
                }
                with open(self.state_file, 'w') as f:
                    json.dump(data, f, indent=2)
            except Exception as e:
                logger.error(f"Could not save state: {e}")
                
    def mark_completed(self, page_id: int):
        with self.lock:
            self.state['completed_ids'].add(page_id)
            self.state['failed_ids'].discard(page_id)
            
    def mark_failed(self, page_id: int):
        with self.lock:
            self.state['failed_ids'].add(page_id)
            
    def is_completed(self, page_id: int) -> bool:
        return page_id in self.state['completed_ids']
        
    def get_pending_ids(self, all_ids: List[Tuple[int, str]]) -> List[Tuple[int, str]]:
        """Filter out already completed IDs."""
        return [(pid, mname) for pid, mname in all_ids if not self.is_completed(pid)]


class InterruptHandler:
    """Handle graceful shutdown on interrupt signals."""
    
    def __init__(self):
        self.interrupted = False
        self.original_handlers = {}
        
    def __enter__(self):
        self.original_handlers[signal.SIGINT] = signal.signal(signal.SIGINT, self._handle_interrupt)
        if hasattr(signal, 'SIGTERM'):
            self.original_handlers[signal.SIGTERM] = signal.signal(signal.SIGTERM, self._handle_interrupt)
        return self
        
    def __exit__(self, exc_type, exc_val, exc_tb):
        for sig, handler in self.original_handlers.items():
            signal.signal(sig, handler)
            
    def _handle_interrupt(self, signum, frame):
        logger.info("Received interrupt signal, shutting down gracefully...")
        self.interrupted = True


def chunked(seq: List[int], n: int) -> Iterable[List[int]]:
    """Split sequence into chunks of size n."""
    if n <= 0:
        n = 1
    for i in range(0, len(seq), n):
        yield seq[i:i + n]


def find_manuals(names: Set[str]) -> List[SourceManual]:
    """Find manuals by name with better error reporting."""
    if not names:
        manuals = list(SourceManual.objects.filter(enable=True))
        logger.info(f"Found {len(manuals)} enabled manuals")
        return manuals
        
    lowered = {n.strip().lower() for n in names if n and n.strip()}
    qs = SourceManual.objects.filter(enable=True)
    found: List[SourceManual] = []
    
    for manual in qs:
        candidates = {
            (manual.name or "").lower(),
            (manual.title or "").lower(),
            (getattr(manual.source, "title", "") or "").lower(),
            (getattr(manual.source, "name", "") or "").lower(),
        }
        if candidates & lowered:
            found.append(manual)
            logger.debug(f"Matched manual: {manual.name} ({manual.title})")
            
    if not found:
        available = [m.name for m in qs if m.name]
        logger.error(f"No manuals found for {sorted(names)}. Available: {available}")
    else:
        logger.info(f"Found {len(found)} matching manuals")
        
    return found


def select_page_ids_batch(
    manual: SourceManual,
    max_count: Optional[int],
    since_id: Optional[int],
    until_id: Optional[int],
    forced: bool,
    name_filter: Optional[str],
    batch_size: int = 1000
) -> Iterable[List[int]]:
    """Memory-efficient batch selection of page IDs."""
    base_qs = build_base_queryset(manual, forced, since_id, until_id, name_filter)
    
    processed = 0
    offset = 0
    
    while True:
        if max_count and processed >= max_count:
            break
            
        current_batch_size = min(batch_size, (max_count - processed) if max_count else batch_size)
        batch_ids = list(base_qs.values_list("id", flat=True)[offset:offset + current_batch_size])
        
        if not batch_ids:
            break
            
        yield batch_ids
        processed += len(batch_ids)
        offset += current_batch_size
        
        logger.debug(f"Selected batch: {len(batch_ids)} IDs (total: {processed})")


def build_base_queryset(manual, forced, since_id, until_id, name_filter):
    """Build the base queryset for page selection."""
    if forced:
        qs = (
            NetworkMonitoredPage.objects.filter(source=manual.source)
            .exclude(
                Q(sliced_html__isnull=True)
                | Q(sliced_html__exact="")
                | Q(sliced_html__exact="{}")
                | Q(sliced_html__exact="[]")
                | Q(sliced_html__exact=" ")
                | Q(html__isnull=True)
                | Q(html__exact="")
                | Q(html__exact="error")
                | Q(html__exact="{}")
                | Q(html__exact="[]")
                | Q(html__exact=" ")
            )
            .order_by("id")
        )
    else:
        qs = (
            NetworkMonitoredPage.objects.filter(
                network_ad_manual__isnull=True, source=manual.source
            )
            .exclude(
                Q(sliced_html__isnull=True)
                | Q(sliced_html__exact="")
                | Q(sliced_html__exact="{}")
                | Q(sliced_html__exact="[]")
                | Q(sliced_html__exact=" ")
                | Q(html__isnull=True)
                | Q(html__exact="")
                | Q(html__exact="{}")
                | Q(html__exact="[]")
                | Q(html__exact=" ")
            )
            .order_by("id")
        )

    if since_id:
        qs = qs.filter(id__gte=since_id)
    if until_id:
        qs = qs.filter(id__lte=until_id)
    if name_filter:
        qs = qs.filter(name__icontains=name_filter)
        
    return qs


def run_manage_for_id_with_retry(
    page_data: Tuple[int, str],  # (page_id, manual_name)
    config: ParserConfig,
    max_retries: int = 2
) -> TaskResult:
    """Run manage.py for a page ID with retry logic."""
    page_id, manual_name = page_data
    start_time = time.time()
    result = TaskResult(page_id=page_id, manual_name=manual_name, exit_code=1, duration=0.0, timestamp=start_time)
    
    for attempt in range(max_retries + 1):
        try:
            cmd = build_command(page_id, manual_name, config)
            
            # Log the actual command being executed (first attempt only)
            if attempt == 0:
                logger.info(f"Executing: {' '.join(cmd)}")
            
            env = os.environ.copy()
            env.setdefault("PYTHONUNBUFFERED", "1")
            env.setdefault("PYTHONIOENCODING", "utf-8")  # Force UTF-8 encoding
            
            proc = Popen(
                cmd, 
                stdout=PIPE, 
                stderr=PIPE, 
                cwd=config.cwd, 
                env=env, 
                text=True,
                encoding='utf-8',
                errors='replace'  # Replace invalid characters instead of failing
            )
            
            try:
                stdout, stderr = proc.communicate(timeout=config.timeout)
                result.exit_code = proc.returncode
                result.stdout = stdout
                result.stderr = stderr
                result.retries = attempt
                
                if proc.returncode == 0:
                    logger.info(f"✓ Success: page_id={page_id} manual={manual_name} (attempt {attempt + 1})")
                    # Log stdout if it contains useful info
                    if stdout and len(stdout.strip()) > 0:
                        logger.debug(f"  stdout: {stdout.strip()[:500]}")
                    break
                else:
                    logger.warning(f"✗ Failed: page_id={page_id} manual={manual_name} rc={proc.returncode} (attempt {attempt + 1})")
                    # Log stderr for debugging
                    if stderr and len(stderr.strip()) > 0:
                        logger.warning(f"  stderr: {stderr.strip()[:500]}")
                    if stdout and len(stdout.strip()) > 0:
                        logger.warning(f"  stdout: {stdout.strip()[:500]}")
                    if attempt < max_retries:
                        time.sleep(config.retry_delay * (2 ** attempt))  # Exponential backoff
                        
            except TimeoutExpired:
                proc.kill()
                proc.communicate()
                logger.error(f"Timeout: page_id={page_id} (attempt {attempt + 1})")
                if attempt < max_retries:
                    time.sleep(config.retry_delay)
                    
        except Exception as e:
            logger.error(f"Exception processing page_id={page_id}: {e}")
            if attempt < max_retries:
                time.sleep(config.retry_delay)
    
    result.duration = time.time() - start_time
    return result


def build_command(page_id: int, manual_name: str, config: ParserConfig) -> List[str]:
    """Build the command to run manage.py."""
    cmd = [config.python_exe, config.manage_py, "manual_parser", "--id", str(page_id), "--name", manual_name]
    
    if config.dry_run:
        cmd.append("--dry-run")
    if config.force:
        cmd.append("--force")
    if config.force_names:
        cmd.extend(["--force-name", ",".join(config.force_names)])
    if config.use_description_scraper:
        cmd.append("--use-description-scraper")
        
    return cmd


def process_batch_worker(batch_and_config: Tuple[List[Tuple[int, str]], ParserConfig]) -> List[TaskResult]:
    """Worker function to process a batch of (page_id, manual_name) tuples."""
    batch, config = batch_and_config
    results = []
    
    for page_data in batch:
        result = run_manage_for_id_with_retry(page_data, config, config.max_retries)
        results.append(result)
        
    return results


def load_config_file(config_path: str) -> Dict[str, Any]:
    """Load configuration from JSON file."""
    try:
        with open(config_path, 'r') as f:
            return json.load(f)
    except Exception as e:
        logger.error(f"Could not load config file {config_path}: {e}")
        return {}


def main(argv: Optional[List[str]] = None) -> int:
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Enhanced parallel orchestrator for manual_parser",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s --name dobry --workers 8 --max 1000
  %(prog)s --name dobry,otodom --workers 6 --max 3000 --timeout 600
  %(prog)s --name nonline --workers 4 --max 500 --use-description-scraper
  %(prog)s --config config/parser.json
  %(prog)s --name dobry --resume --state-file .parser_state.json
        """
    )
    
    # Configuration options
    parser.add_argument("--config", help="Load config from JSON file")
    parser.add_argument("--name", required=True, help="Manual name(s), comma-separated. Use 'all' or '*' for all enabled")
    parser.add_argument("--workers", type=int, default=0, help="Number of worker processes (0 = auto-detect)")
    parser.add_argument("--auto-workers", action="store_true", help="Automatically determine optimal worker count")
    parser.add_argument("--adaptive-workers", action="store_true", help="Dynamically adjust workers during runtime")
    parser.add_argument("--min-workers", type=int, default=2, help="Minimum workers for auto mode")
    parser.add_argument("--max-workers-limit", type=int, default=0, help="Maximum workers limit (0 = no limit)")
    parser.add_argument("--max", type=int, default=0, help="Maximum pages to process")
    parser.add_argument("--per-domain-max", type=int, default=0, help="Maximum pages per manual when multiple manuals are provided")
    parser.add_argument(
        "--per-domain-max-map",
        default="",
        help="Manual-specific caps as 'name1=100,name2=50' (case-insensitive). Overrides --per-domain-max for matching manuals"
    )
    parser.add_argument("--batch-size", type=int, default=1, help="Pages per batch")
    parser.add_argument("--since-id", type=int, help="Lower bound page ID")
    parser.add_argument("--until-id", type=int, help="Upper bound page ID")
    parser.add_argument("--filter-name", default="", help="Filter by page name")
    parser.add_argument("--dry-run", action="store_true", help="Dry run mode")
    parser.add_argument("--force", action="store_true", help="Force processing")
    parser.add_argument("--force-name", default="", help="Force specific manuals")
    parser.add_argument("--use-description-scraper", action="store_true", help="Enable AI description scraper for missing fields")
    parser.add_argument("--timeout", type=int, default=300, help="Timeout per page (seconds)")
    parser.add_argument("--max-retries", type=int, default=2, help="Max retries per page")
    parser.add_argument("--retry-delay", type=float, default=1.0, help="Delay between retries")
    parser.add_argument("--state-file", default=".parser_state.json", help="State file for resume")
    parser.add_argument("--resume", action="store_true", help="Resume from previous state")
    parser.add_argument("--log-level", default="INFO", help="Logging level")
    parser.add_argument("--memory-limit", type=int, default=4096, help="Memory limit in MB")
    parser.add_argument("--python", default=sys.executable, help="Python executable")
    parser.add_argument("--manage", default=str(MANAGE_PY), help="Path to manage.py")
    parser.add_argument("--cwd", default=str(REPO_ROOT), help="Working directory")
    
    args = parser.parse_args(argv)
    
    # Set logging level
    logging.getLogger().setLevel(getattr(logging, args.log_level.upper()))
    
    # Load config file if specified
    file_config = {}
    if args.config:
        file_config = load_config_file(args.config)
    
    # Build manual list, supporting 'all'/'*'
    parsed_manuals = [s.strip() for s in args.name.split(",") if s.strip()]
    if any(m.lower() in {"all", "*"} for m in parsed_manuals):
        parsed_manuals = []  # Empty set => all enabled manuals

    # Parse per-domain max map
    per_domain_map: Dict[str, int] = {}
    if args.per_domain_max_map:
        for part in args.per_domain_max_map.split(","):
            if not part:
                continue
            if "=" in part:
                k, v = part.split("=", 1)
                k = k.strip().lower()
                try:
                    per_domain_map[k] = int(v.strip())
                except ValueError:
                    logger.warning(f"Ignoring invalid per-domain-max entry: {part}")

    # Build configuration
    config = ParserConfig(
        manuals=parsed_manuals,
        workers=args.workers,
        max_pages=args.max,
        per_domain_max=args.per_domain_max,
        per_domain_max_map=per_domain_map,
        batch_size=args.batch_size,
        since_id=args.since_id,
        until_id=args.until_id,
        filter_name=args.filter_name,
        dry_run=args.dry_run,
        force=args.force,
        force_names=[s.strip() for s in args.force_name.split(",") if s.strip()],
        use_description_scraper=args.use_description_scraper,
        python_exe=args.python,
        manage_py=args.manage,
        cwd=args.cwd,
        timeout=args.timeout,
        max_retries=args.max_retries,
        retry_delay=args.retry_delay,
        state_file=args.state_file,
        log_level=args.log_level,
        memory_limit_mb=args.memory_limit,
        auto_workers=(args.workers == 0 or args.auto_workers),
        adaptive_workers=args.adaptive_workers,
        min_workers=args.min_workers,
        max_workers_limit=args.max_workers_limit
    )
    
    # Apply config file overrides
    for key, value in file_config.items():
        if hasattr(config, key):
            setattr(config, key, value)
    
    logger.info(f"Starting parallel parser with {config.workers} workers")
    logger.info(f"Target manuals: {', '.join(config.manuals)}")
    
    # Initialize state manager
    state_manager = StateManager(config.state_file)
    if args.resume:
        state_manager.load()
    
    # Find manuals
    manual_set = set(config.manuals) if config.manuals else set()
    manuals = find_manuals(manual_set)
    if not manuals:
        logger.error("No matching manuals found")
        return 1
    
    # Initialize monitoring
    monitor = ProcessMonitor()
    stats = ProcessingStats(start_time=time.time())
    
    try:
        with InterruptHandler() as interrupt_handler:
            monitor.start()
            
            # Collect all page IDs
            logger.info("Collecting page IDs...")
            all_ids = []
            
            # Helper to choose per-manual limit
            def get_manual_limit(m) -> Optional[int]:
                # When only one manual, default to global max
                if len(manuals) == 1 and config.max_pages:
                    return config.max_pages
                # Specific mapping overrides
                candidates = [
                    (m.name or "").lower(),
                    (m.title or "").lower(),
                    (getattr(m.source, "name", "") or "").lower(),
                    (getattr(m.source, "title", "") or "").lower(),
                ]
                for key in candidates:
                    if key and key in config.per_domain_max_map:
                        return config.per_domain_max_map[key]
                # Uniform per-domain cap
                return config.per_domain_max or None

            for manual in manuals:
                manual_limit = get_manual_limit(manual)
                manual_name = manual.name or manual.title or f"manual_{manual.id}"
                for batch in select_page_ids_batch(
                    manual=manual,
                    max_count=manual_limit,
                    since_id=config.since_id,
                    until_id=config.until_id,
                    forced=config.force,
                    name_filter=config.filter_name if config.filter_name else None,
                    batch_size=1000
                ):
                    # Store as (page_id, manual_name) tuples
                    all_ids.extend([(pid, manual_name) for pid in batch])
                    if interrupt_handler.interrupted:
                        break
                        
                if interrupt_handler.interrupted:
                    break
            
            # Apply max limit across all manuals
            if config.max_pages > 0:
                all_ids = all_ids[:config.max_pages]
            
            # Filter out completed IDs if resuming
            pending_ids = state_manager.get_pending_ids(all_ids) if args.resume else all_ids
            
            # Remove duplicates while preserving order (keep first occurrence of each page_id)
            seen = set()
            unique_ids = []
            for pid, mname in pending_ids:
                if pid not in seen:
                    seen.add(pid)
                    unique_ids.append((pid, mname))
            
            stats.total_pages = len(unique_ids)
            
            if not unique_ids:
                logger.info("No pages to process")
                return 0
            
            logger.info(f"Processing {len(unique_ids)} pages in batches of {config.batch_size}")
            
            # Create batches
            batches = list(chunked(unique_ids, config.batch_size))
            batch_configs = [(batch, config) for batch in batches]
            
            # Process with progress tracking
            with tqdm(total=len(unique_ids), desc="Processing pages", disable=not HAS_TQDM) as pbar:
                with ProcessPoolExecutor(max_workers=config.workers) as executor:
                    # Submit all batches
                    future_to_batch = {
                        executor.submit(process_batch_worker, batch_config): i 
                        for i, batch_config in enumerate(batch_configs)
                    }
                    
                    # Collect results
                    durations = []
                    for future in as_completed(future_to_batch):
                        if interrupt_handler.interrupted:
                            logger.info("Cancelling remaining tasks...")
                            for f in future_to_batch:
                                f.cancel()
                            break
                            
                        try:
                            batch_results = future.result()
                            
                            for result in batch_results:
                                durations.append(result.duration)
                                
                                if result.exit_code == 0:
                                    stats.completed += 1
                                    state_manager.mark_completed(result.page_id)
                                    logger.debug(f"✓ {result.page_id}")
                                else:
                                    stats.failed += 1
                                    state_manager.mark_failed(result.page_id)
                                    logger.warning(f"✗ {result.page_id} (rc={result.exit_code})")
                                    if result.stderr:
                                        logger.debug(f"  stderr: {result.stderr.strip()}")
                                
                                if result.retries > 0:
                                    stats.retried += 1
                                
                                pbar.update(1)
                            
                            # Save state periodically
                            if stats.completed % 100 == 0:
                                state_manager.save()
                                
                        except Exception as e:
                            logger.error(f"Batch processing error: {e}")
                            logger.debug(traceback.format_exc())
                            stats.failed += config.batch_size
                            pbar.update(config.batch_size)
            
            # Final state save
            state_manager.save()
            
    except KeyboardInterrupt:
        logger.info("Interrupted by user")
        return 130
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        logger.debug(traceback.format_exc())
        return 1
    finally:
        monitor.stop()
        stats.end_time = time.time()
        stats.peak_memory_mb = monitor.peak_memory
        if durations:
            stats.avg_duration = sum(durations) / len(durations)
    
    # Print final statistics
    print("\n" + "="*60)
    print("PROCESSING COMPLETE")
    print("="*60)
    print(f"Total pages:      {stats.total_pages:,}")
    print(f"Completed:        {stats.completed:,}")
    print(f"Failed:           {stats.failed:,}")
    print(f"Success rate:     {stats.success_rate():.1f}%")
    print(f"Retried:          {stats.retried:,}")
    print(f"Duration:         {stats.total_duration():.1f}s")
    print(f"Avg per page:     {stats.avg_duration:.2f}s")
    print(f"Peak memory:      {stats.peak_memory_mb:.1f} MB")
    
    if stats.failed > 0:
        print(f"\nFailed IDs saved to state file: {config.state_file}")
        print("Run with --resume to retry failed pages")
    
    return 0 if stats.failed == 0 else 1


if __name__ == "__main__":
    raise SystemExit(main())