"""
Django management command to provide real-time analytics data.
Usage: python manage.py realtime_data <action> [--domain DOMAIN] [--limit N]
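
Example invocations (the domain value is illustrative):
    python manage.py realtime_data server-status
    python manage.py realtime_data overall-performance
    python manage.py realtime_data portal-details --domain example.com
    python manage.py realtime_data live-feed --limit 20

Each action prints a single JSON document to stdout.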
"""

from django.core.management.base import BaseCommand
from django.utils import timezone
from django.db.models import Count, Q
from extractly.models import AdsManual, NetworkMonitoredPage, SourceNetwork
import json
from datetime import timedelta
from urllib.parse import urlparse


class Command(BaseCommand):
    help = 'Provides real-time analytics data for the monitoring UI'

    def add_arguments(self, parser):
        parser.add_argument(
            'action',
            type=str,
            choices=['server-status', 'overall-performance', 'portal-details', 'live-feed'],
            help='Type of data to retrieve'
        )
        parser.add_argument(
            '--domain',
            type=str,
            help='Portal domain (required for portal-details action)'
        )
        parser.add_argument(
            '--limit',
            type=int,
            default=50,
            help='Number of recent items to fetch (for live-feed action)'
        )

    def handle(self, *args, **options):
        action = options['action']
        
        try:
            if action == 'server-status':
                data = self.get_server_status()
            elif action == 'overall-performance':
                data = self.get_overall_performance()
            elif action == 'portal-details':
                domain = options.get('domain')
                if not domain:
                    data = {'error': 'Domain parameter required for portal-details'}
                else:
                    data = self.get_portal_details(domain)
            elif action == 'live-feed':
                limit = options.get('limit', 50)
                data = self.get_live_feed(limit)
            else:
                data = {'error': 'Unknown action'}
            
            self.stdout.write(json.dumps(data, default=str))
        except Exception as e:
            self.stdout.write(json.dumps({'error': str(e)}))
    
    def get_server_status(self):
        """Check if server is actively updating data."""
        try:
            # Check latest AdsManual entry
            latest_ad = AdsManual.objects.order_by('-created_at').first()
            
            if not latest_ad:
                return {
                    'is_live': False,
                    'last_update': None,
                    'minutes_since_update': None,
                    'message': 'No data found in database'
                }
            
            last_update = latest_ad.created_at
            now = timezone.now()
            time_diff = now - last_update
            minutes_since = int(time_diff.total_seconds() / 60)
            
            update_threshold = 15
            is_live = minutes_since < update_threshold
            
            return {
                'is_live': is_live,
                'last_update': last_update.isoformat(),
                'minutes_since_update': minutes_since,
                'message': 'Server is live and updating' if is_live else f'Server offline - Last update {minutes_since} minutes ago'
            }
        except Exception as e:
            return {
                'is_live': False,
                'last_update': None,
                'minutes_since_update': None,
                'message': f'Error: {str(e)}'
            }
    
    def get_overall_performance(self):
        """Get performance metrics for all portals using NetworkMonitoredPage and AdsManual."""
        try:
            # Get all sources (portals)
            sources = SourceNetwork.objects.filter(enabled=True)
            
            portal_list = []
            performances = []
            
            for source in sources:
                domain = urlparse(source.base_url).netloc
                
                # Optimized: Use aggregate for counts instead of .count() queries
                page_stats = NetworkMonitoredPage.objects.filter(source=source).aggregate(
                    total=Count('id'),
                    parsed=Count('id', filter=Q(network_ad_manual__isnull=False))
                )
                total_pages = page_stats['total'] or 0
                parsed_pages = page_stats['parsed'] or 0
                
                # Get AdsManual entries for this portal - use aggregation for performance
                # Count complete ads by checking that all critical fields are populated
                text_fields = ['city', 'title']
                numeric_fields = ['price', 'square_footage', 'rooms']
                
                # Build Q object for complete ads (all critical fields have values)
                complete_condition = Q()
                
                # For text fields: not null and not empty string
                for field in text_fields:
                    complete_condition &= ~Q(**{field: None}) & ~Q(**{field: ''})
                
                # For numeric fields: not null and greater than 0
                for field in numeric_fields:
                    complete_condition &= ~Q(**{field: None}) & Q(**{f'{field}__gt': 0})
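                # The combined condition translates roughly to:
                #   NOT (city IS NULL) AND NOT (city = '') AND NOT (title IS NULL) AND NOT (title = '')
                #   AND NOT (price IS NULL) AND price > 0  (same pattern for square_footage and rooms)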
                
                ad_stats = AdsManual.objects.filter(
                    networkmonitoredpage__source=source
                ).aggregate(
                    total=Count('id', distinct=True),
                    complete=Count('id', filter=complete_condition, distinct=True)
                )
                
                total_ads = ad_stats['total'] or 0
                complete_ads = ad_stats['complete'] or 0
                
                # Performance = (complete ads / total monitored pages) * 100
                performance = (complete_ads / total_pages * 100) if total_pages > 0 else 0
                
                # Success rate = (parsed pages / total pages) * 100
                success_rate = (parsed_pages / total_pages * 100) if total_pages > 0 else 0
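                # Worked example (illustrative numbers): 45 complete ads over 60 monitored
                # pages gives performance = 75.0; 55 parsed pages over 60 gives success_rate ≈ 91.67.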
                
                # Calculate basic fields percentage (14 main fields)
                basic_fields = [
                    'price', 'square_footage', 'city', 'title', 'rooms',
                    'estate_type', 'offer_type', 'description', 'address',
                    'floor', 'floors_num', 'bathrooms', 'lat', 'lon'
                ]
                basic_complete_count = 0
                
                # distinct() avoids double-counting ads linked from several monitored pages
                for ad in AdsManual.objects.filter(networkmonitoredpage__source=source).distinct():
                    has_all_basic = True
                    for field in basic_fields:
                        value = getattr(ad, field, None)
                        if value in [None, '', 0, '0']:
                            has_all_basic = False
                            break
                    if has_all_basic:
                        basic_complete_count += 1
                
                basic_performance = (basic_complete_count / total_pages * 100) if total_pages > 0 else 0
                
                if performance >= 90:
                    status = 'excellent'
                elif performance >= 70:
                    status = 'good'
                elif performance >= 50:
                    status = 'fair'
                else:
                    status = 'poor'
                
                portal_data = {
                    'domain': domain,
                    'total_pages': total_pages,
                    'parsed_pages': parsed_pages,
                    'unparsed_pages': total_pages - parsed_pages,
                    'complete_ads': complete_ads,
                    'incomplete_ads': parsed_pages - complete_ads,
                    'performance_percentage': round(performance, 2),
                    'basic_fields_percentage': round(basic_performance, 2),
                    'success_rate': round(success_rate, 2),
                    'status': status
                }
                
                portal_list.append(portal_data)
                performances.append(performance)
            
            # Sort by total pages descending
            portal_list.sort(key=lambda x: x['total_pages'], reverse=True)
            
            avg_performance = sum(performances) / len(performances) if performances else 0
            best_portal = max(portal_list, key=lambda x: x['performance_percentage']) if portal_list else None
            worst_portal = min(portal_list, key=lambda x: x['performance_percentage']) if portal_list else None
            
            return {
                'portals': portal_list,
                'summary': {
                    'total_portals': len(portal_list),
                    'avg_performance': round(avg_performance, 2),
                    'best_portal': best_portal['domain'] if best_portal else None,
                    'worst_portal': worst_portal['domain'] if worst_portal else None
                },
                'timestamp': timezone.now().isoformat()
            }
        except Exception as e:
            return {
                'portals': [],
                'summary': {
                    'total_portals': 0,
                    'avg_performance': 0,
                    'best_portal': None,
                    'worst_portal': None
                },
                'error': str(e),
                'timestamp': timezone.now().isoformat()
            }
    
    def get_portal_details(self, domain):
        """Get detailed analysis for a specific portal."""
        try:
            # Find the source by domain - try multiple matching strategies
            
            # Try exact domain match in base_url
            source = SourceNetwork.objects.filter(base_url__icontains=domain).first()
            
            # If not found, try matching just the domain part
            if not source:
                for src in SourceNetwork.objects.filter(enabled=True):
                    src_domain = urlparse(src.base_url).netloc
                    if domain.lower() in src_domain.lower() or src_domain.lower() in domain.lower():
                        source = src
                        break
            
            if not source:
                return {
                    'domain': domain,
                    'error': f'Portal not found. Searched for: {domain}',
                    'timestamp': timezone.now().isoformat()
                }
            
            # Total monitored pages for this portal
            total_pages = NetworkMonitoredPage.objects.filter(source=source).count()
            
            # Pages successfully parsed (have AdsManual entry)
            parsed_pages = NetworkMonitoredPage.objects.filter(
                source=source,
                network_ad_manual__isnull=False
            ).count()
            
            unparsed_pages = total_pages - parsed_pages
            
            # Get all AdsManual entries for this portal
            ads = AdsManual.objects.filter(
                networkmonitoredpage__source=source
            ).distinct()
            
            # Define main/basic important selectors/fields to check
            basic_selector_fields = [
                'price', 'square_footage', 'city', 'title', 'rooms',
                'estate_type', 'offer_type', 'description', 'address',
                'floor', 'floors_num', 'bathrooms', 'lat', 'lon'
            ]
            
            # Get ALL fields from AdsManual model (excluding system fields)
            all_fields = []
            excluded_fields = ['id', 'created_at', 'updated_at', 'incomplete_data', 'network_ad_manual']
            for field in AdsManual._meta.get_fields():
                if field.name not in excluded_fields and not field.many_to_many and not field.one_to_many:
                    all_fields.append(field.name)
            
            # Evaluate the ad count once and reuse it for every field below
            total_ads = ads.count()
            
            # Analyze BASIC fields
            basic_field_stats = {}
            for field in basic_selector_fields:
                populated_count = 0
                for ad in ads:
                    value = getattr(ad, field, None)
                    if value not in [None, '', 0, '0', []]:
                        populated_count += 1
                
                success_rate = (populated_count / total_ads * 100) if total_ads > 0 else 0
                basic_field_stats[field] = {
                    'populated': populated_count,
                    'missing': total_ads - populated_count,
                    'success_rate': success_rate
                }
            
            # Analyze ALL fields
            all_field_stats = {}
            for field in all_fields:
                populated_count = 0
                for ad in ads:
                    try:
                        value = getattr(ad, field, None)
                        if value not in [None, '', 0, '0', []]:
                            populated_count += 1
                    except Exception:
                        continue
                
                success_rate = (populated_count / total_ads * 100) if total_ads > 0 else 0
                all_field_stats[field] = {
                    'populated': populated_count,
                    'missing': total_ads - populated_count,
                    'success_rate': success_rate
                }
            
            # Calculate BASIC selector health
            basic_problematic_selectors = []
            basic_good_selectors = 0
            basic_bad_selectors = 0
            
            for field, stats in basic_field_stats.items():
                if stats['success_rate'] < 50:
                    basic_problematic_selectors.append(field)
                    basic_bad_selectors += 1
                else:
                    basic_good_selectors += 1
            
            total_basic_selectors = len(basic_selector_fields)
            basic_selector_health = (basic_good_selectors / total_basic_selectors * 100) if total_basic_selectors > 0 else 0
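            # Worked example (illustrative): 11 of the 14 basic selectors reaching the 50%
            # threshold gives basic_selector_health = 11 / 14 * 100 ≈ 78.57.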
            
            # Calculate OVERALL selector health (all fields)
            all_problematic_selectors = []
            all_good_selectors = 0
            all_bad_selectors = 0
            
            for field, stats in all_field_stats.items():
                if stats['success_rate'] < 50:
                    all_problematic_selectors.append(field)
                    all_bad_selectors += 1
                else:
                    all_good_selectors += 1
            
            total_all_selectors = len(all_fields)
            overall_selector_health = (all_good_selectors / total_all_selectors * 100) if total_all_selectors > 0 else 0
            
            # Count complete ads (all critical fields populated)
            critical_fields = ['price', 'square_footage', 'city', 'title']
            complete_ads = 0
            incomplete_ads = 0
            
            for ad in ads:
                is_complete = all([
                    getattr(ad, field) not in [None, '', 0, '0']
                    for field in critical_fields
                ])
                if is_complete:
                    complete_ads += 1
                else:
                    incomplete_ads += 1
            
            # Overall performance
            performance = (complete_ads / total_pages * 100) if total_pages > 0 else 0
            success_rate = (parsed_pages / total_pages * 100) if total_pages > 0 else 0
            
            # Error breakdown
            by_type = []
            if unparsed_pages > 0:
                by_type.append({'error_type': 'Not Parsed', 'count': unparsed_pages})
            if incomplete_ads > 0:
                by_type.append({'error_type': 'Incomplete Data', 'count': incomplete_ads})
            
            # Recent errors
            recent = []
            recent_incomplete = ads.filter(incomplete_data=True).order_by('-created_at')[:5]
            for ad in recent_incomplete:
                missing = []
                for field in critical_fields:
                    if getattr(ad, field) in [None, '', 0, '0']:
                        missing.append(field)
                
                recent.append({
                    'page_id': ad.id,
                    'url': ad.url[:100],
                    'error': f'Missing fields: {", ".join(missing)}',
                    'error_type': 'Incomplete Data',
                    'timestamp': ad.created_at.isoformat() if ad.created_at else None
                })
            
            # ===== DESCRIPTION SCRAPER ANALYSIS (ONLY ADS WHERE IT FOUND VARIABLES) =====
            # Only analyze impact on ads where description scraper actually found at least 1 variable
            ads_with_description_scraping = ads.exclude(description_scraped_variables__isnull=True).exclude(description_scraped_variables='')
            total_with_desc_scraper = ads_with_description_scraping.count()
            
            # Count fields enriched by description scraper (only for ads with variables found)
            desc_enriched_fields = {}
            total_enrichments = 0
            ads_with_findings = 0  # Count of ads where desc scraper found at least 1 field
            
            for ad in ads_with_description_scraping:
                if ad.description_scraped_variables:
                    # Parse comma-separated field names
                    enriched = [f.strip() for f in ad.description_scraped_variables.split(',') if f.strip()]
                    if len(enriched) > 0:  # Only count if it found at least 1 field
                        ads_with_findings += 1
                        total_enrichments += len(enriched)
                        for field_name in enriched:
                            desc_enriched_fields[field_name] = desc_enriched_fields.get(field_name, 0) + 1
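            # Example (illustrative value): description_scraped_variables = "rooms, floor, bathrooms"
            # parses to ['rooms', 'floor', 'bathrooms'], i.e. 3 enrichments for that ad.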
            
            # Calculate performance WITHOUT description scraper (ONLY for ads where it found variables)
            # (count ads that would be complete using only manual selectors)
            manual_only_complete_in_affected = 0
            description_boost_count = 0  # Ads made complete (critical fields filled)
            total_ads_with_any_enrichment = 0  # Total ads that got ANY field enriched
            
            for ad in ads_with_description_scraping:
                if not ad.description_scraped_variables:
                    continue
                    
                enriched = [f.strip() for f in ad.description_scraped_variables.split(',') if f.strip()]
                if len(enriched) == 0:  # Skip ads where desc scraper found nothing
                    continue
                
                # Count this ad as enriched (it got at least 1 field)
                total_ads_with_any_enrichment += 1
                
                # Check if ad has all critical fields from manual parser only
                manual_complete = True
                desc_scraped_criticals = enriched
                
                for field in critical_fields:
                    field_value = getattr(ad, field, None)
                    has_value = field_value not in [None, '', 0, '0']
                    
                    # If field has value but was scraped from description, it's not from manual parser
                    if has_value and field in desc_scraped_criticals:
                        manual_complete = False
                        break
                    elif not has_value:
                        manual_complete = False
                        break
                
                if manual_complete:
                    manual_only_complete_in_affected += 1
                elif all([getattr(ad, field) not in [None, '', 0, '0'] for field in critical_fields]):
                    # Ad is complete now but wasn't complete with manual only
                    description_boost_count += 1
            
            # Calculate boost percentage ONLY for ads where desc scraper found variables
            manual_only_performance_affected = (manual_only_complete_in_affected / ads_with_findings * 100) if ads_with_findings > 0 else 0
            complete_in_affected = manual_only_complete_in_affected + description_boost_count
            complete_performance_affected = (complete_in_affected / ads_with_findings * 100) if ads_with_findings > 0 else 0
            desc_boost_percentage = complete_performance_affected - manual_only_performance_affected
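            # Worked example (illustrative): with 80 ads where the scraper found variables,
            # 20 complete from manual selectors alone (25.0%) and 50 complete overall (62.5%)
            # gives desc_boost_percentage = 62.5 - 25.0 = 37.5 points.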
            
            # Calculate average enrichments per ad (only ads where it found something)
            avg_enrichments = (total_enrichments / ads_with_findings) if ads_with_findings > 0 else 0
            
            # Sort enriched fields by frequency
            top_enriched_fields = sorted(desc_enriched_fields.items(), key=lambda x: x[1], reverse=True)[:10]
            
            # Recent activity
            now = timezone.now()
            one_hour_ago = now - timedelta(hours=1)
            one_day_ago = now - timedelta(days=1)
            two_days_ago = now - timedelta(days=2)
            
            pages_last_hour = ads.filter(created_at__gte=one_hour_ago).count()
            pages_last_24h = ads.filter(created_at__gte=one_day_ago).count()
            pages_prev_24h = ads.filter(
                created_at__gte=two_days_ago,
                created_at__lt=one_day_ago
            ).count()
            
            if pages_last_24h > pages_prev_24h * 1.1:
                trend = 'increasing'
            elif pages_last_24h < pages_prev_24h * 0.9:
                trend = 'decreasing'
            else:
                trend = 'stable'
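            # Example (illustrative counts): 110 pages in the last 24h vs 90 in the previous
            # 24h exceeds 90 * 1.1 = 99, so the trend is reported as 'increasing'.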
            
            return {
                'domain': domain,
                'performance_percentage': round(performance, 2),
                'manual_only_performance': round(manual_only_performance_affected, 2),
                'with_desc_scraper_performance': round(complete_performance_affected, 2),
                'description_boost': round(desc_boost_percentage, 2),
                'ads_with_desc_findings': ads_with_findings,
                'total_ads_checked': total_pages,
                'basic_selector_analysis': {
                    'good_selectors': basic_good_selectors,
                    'bad_selectors': basic_bad_selectors,
                    'total_selectors': total_basic_selectors,
                    'selector_health_percentage': round(basic_selector_health, 2),
                    'problematic_selectors': basic_problematic_selectors[:10],
                    'field_details': basic_field_stats
                },
                'overall_selector_analysis': {
                    'good_selectors': all_good_selectors,
                    'bad_selectors': all_bad_selectors,
                    'total_selectors': total_all_selectors,
                    'selector_health_percentage': round(overall_selector_health, 2),
                    'problematic_selectors': all_problematic_selectors[:20],
                    'field_details': all_field_stats
                },
                'page_analysis': {
                    'total_pages': total_pages,
                    'parsed_pages': parsed_pages,
                    'unparsed_pages': unparsed_pages,
                    'complete_ads': complete_ads,
                    'incomplete_ads': incomplete_ads,
                    'success_rate': round(success_rate, 2)
                },
                'description_scraper_analysis': {
                    'total_ads_with_description_scraping': total_with_desc_scraper,
                    'total_ads_enriched': total_ads_with_any_enrichment,  # Total ads that got ANY field
                    'ads_made_complete': description_boost_count,  # Ads that went from incomplete to complete
                    'manual_only_complete_ads': manual_only_complete_in_affected,
                    'total_enrichments': total_enrichments,
                    'avg_enrichments_per_ad': round(avg_enrichments, 2),
                    'top_enriched_fields': [{'field': field, 'count': count} for field, count in top_enriched_fields],
                    'usage_percentage': round((total_with_desc_scraper / total_ads * 100) if total_ads > 0 else 0, 2)
                },
                'error_breakdown': {
                    'by_type': by_type,
                    'recent_errors': recent
                },
                'recent_activity': {
                    'pages_last_hour': pages_last_hour,
                    'pages_last_24h': pages_last_24h,
                    'pages_prev_24h': pages_prev_24h,
                    'trend': trend
                },
                'timestamp': timezone.now().isoformat()
            }
        except Exception as e:
            return {
                'domain': domain,
                'error': str(e),
                'timestamp': timezone.now().isoformat()
            }
    
    def get_live_feed(self, limit=50):
        """Get real-time feed of recently processed ads."""
        try:
            # Critical fields to check
            critical_fields = ['price', 'square_footage', 'city', 'title', 'rooms']
            
            # Get recent ads ordered by creation time with only necessary fields
            recent_ads = AdsManual.objects.only(
                'id', 'url', 'created_at', 'price', 'square_footage', 'city', 'title', 'rooms',
                'estate_type', 'offer_type', 'description', 'address', 'floor', 'floors_num',
                'bathrooms', 'lat', 'lon', 'description_scraped_variables'
            ).order_by('-created_at')[:limit]
            
            ads_list = []
            completion_rates = []
            portals_active = set()
            
            # Calculate processing rate (ads processed in last minute)
            one_minute_ago = timezone.now() - timedelta(minutes=1)
            ads_last_minute = AdsManual.objects.filter(created_at__gte=one_minute_ago).count()
            
            for ad in recent_ads:
                # Count extracted fields
                fields_extracted = 0
                fields_total = 13  # Number of checks in field_checks below (lat/lon counted as one)
                
                # Check each field
                field_checks = {
                    'price': bool(ad.price),
                    'square_footage': bool(ad.square_footage),
                    'city': bool(ad.city),
                    'title': bool(ad.title),
                    'rooms': bool(ad.rooms),
                    'estate_type': bool(ad.estate_type),
                    'offer_type': bool(ad.offer_type),
                    'description': bool(ad.description),
                    'address': bool(ad.address),
                    'floor': ad.floor is not None,
                    'floors_num': ad.floors_num is not None,
                    'bathrooms': ad.bathrooms is not None,
                    'lat_lon': bool(ad.lat and ad.lon)
                }
                
                fields_extracted = sum(1 for v in field_checks.values() if v)
                completion_rate = (fields_extracted / fields_total) * 100
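                # e.g. (illustrative) 10 of the 13 checks present -> completion_rate ≈ 76.9 after rounding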
                completion_rates.append(completion_rate)
                
                # Check critical fields
                critical_present = all([
                    field_checks.get(field, False) for field in critical_fields
                ])
                
                # Check if description scraper was used
                has_description_scraper = bool(ad.description_scraped_variables)
                
                # Get portal name through the reverse relationship
                # NetworkMonitoredPage has network_ad_manual pointing to AdsManual
                try:
                    monitored_page = NetworkMonitoredPage.objects.filter(network_ad_manual=ad).first()
                    if monitored_page and monitored_page.source:
                        portal = monitored_page.source.name
                    else:
                        portal = monitored_page.name if monitored_page else 'Unknown'
                except Exception:
                    portal = 'Unknown'
                    
                portals_active.add(portal)
                
                # Calculate time since processing (approximate)
                seconds_ago = (timezone.now() - ad.created_at).total_seconds()
                
                ads_list.append({
                    'id': ad.id,
                    'url': ad.url,
                    'portal': portal,
                    'processed_at': ad.created_at.isoformat(),
                    'seconds_ago': int(seconds_ago),
                    'fields_extracted': fields_extracted,
                    'fields_total': fields_total,
                    'completion_rate': round(completion_rate, 1),
                    'has_description_scraper': has_description_scraper,
                    'critical_fields_present': critical_present,
                    'status': 'complete' if critical_present else ('good' if completion_rate >= 70 else 'incomplete'),
                    'enriched_fields': [f.strip() for f in ad.description_scraped_variables.split(',') if f.strip()] if ad.description_scraped_variables else []
                })
            
            # Calculate summary metrics
            avg_completion = sum(completion_rates) / len(completion_rates) if completion_rates else 0
            
            return {
                'ads': ads_list,
                'summary': {
                    'total_processed': len(ads_list),
                    'avg_completion_rate': round(avg_completion, 1),
                    'processing_rate_per_minute': ads_last_minute,
                    'active_portals': len(portals_active),
                    'portals_list': list(portals_active)
                },
                'timestamp': timezone.now().isoformat()
            }
        except Exception as e:
            return {
                'ads': [],
                'summary': {
                    'total_processed': 0,
                    'avg_completion_rate': 0,
                    'processing_rate_per_minute': 0,
                    'active_portals': 0,
                    'portals_list': []
                },
                'error': str(e),
                'timestamp': timezone.now().isoformat()
            }
