"""
Django management command to provide real-time analytics data.
Usage: python manage.py realtime_data <action> [--domain DOMAIN] [--limit N]
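
Example invocations (the domain value is illustrative):
    python manage.py realtime_data server-status
    python manage.py realtime_data overall-performance
    python manage.py realtime_data portal-details --domain example.com
    python manage.py realtime_data live-feed --limit 20

Each action prints a single JSON document to stdout.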
"""

from django.core.management.base import BaseCommand
from django.utils import timezone
from django.db.models import Count, Q
from extractly.models import AdsManual, NetworkMonitoredPage, SourceNetwork
import json
from datetime import timedelta
from urllib.parse import urlparse


class Command(BaseCommand):
    help = 'Provides real-time analytics data for the monitoring UI'

    def add_arguments(self, parser):
        parser.add_argument(
            'action',
            type=str,
            choices=['server-status', 'overall-performance', 'portal-details', 'live-feed'],
            help='Type of data to retrieve'
        )
        parser.add_argument(
            '--domain',
            type=str,
            help='Portal domain (required for portal-details action)'
        )
        parser.add_argument(
            '--limit',
            type=int,
            default=50,
            help='Number of recent items to fetch (for live-feed action)'
        )

    def handle(self, *args, **options):
        action = options['action']
        
        try:
            if action == 'server-status':
                data = self.get_server_status()
            elif action == 'overall-performance':
                data = self.get_overall_performance()
            elif action == 'portal-details':
                domain = options.get('domain')
                if not domain:
                    data = {'error': 'Domain parameter required for portal-details'}
                else:
                    data = self.get_portal_details(domain)
            elif action == 'live-feed':
                limit = options.get('limit', 50)
                data = self.get_live_feed(limit)
            else:
                data = {'error': 'Unknown action'}
            
            self.stdout.write(json.dumps(data, default=str))
        except Exception as e:
            self.stdout.write(json.dumps({'error': str(e)}))
    
    def get_server_status(self):
        """Check if server is actively updating data."""
        try:
            # Check latest AdsManual entry
            latest_ad = AdsManual.objects.order_by('-created_at').first()
            
            if not latest_ad:
                return {
                    'is_live': False,
                    'last_update': None,
                    'minutes_since_update': None,
                    'message': 'No data found in database'
                }
            
            last_update = latest_ad.created_at
            now = timezone.now()
            time_diff = now - last_update
            minutes_since = int(time_diff.total_seconds() / 60)
            
            update_threshold = 15
            is_live = minutes_since < update_threshold
            
            return {
                'is_live': is_live,
                'last_update': last_update.isoformat(),
                'minutes_since_update': minutes_since,
                'message': 'Server is live and updating' if is_live else f'Server offline - Last update {minutes_since} minutes ago'
            }
        except Exception as e:
            return {
                'is_live': False,
                'last_update': None,
                'minutes_since_update': None,
                'message': f'Error: {str(e)}'
            }
    
    def get_overall_performance(self):
        """Get performance metrics for all portals using NetworkMonitoredPage and AdsManual."""
        try:
            # Get all sources (portals)
            sources = SourceNetwork.objects.filter(enabled=True)
            
            portal_list = []
            performances = []
            
            for source in sources:
                domain = urlparse(source.base_url).netloc
                
                # Optimized: Use aggregate for counts instead of .count() queries
                page_stats = NetworkMonitoredPage.objects.filter(source=source).aggregate(
                    total=Count('id'),
                    parsed=Count('id', filter=Q(network_ad_manual__isnull=False))
                )
                total_pages = page_stats['total'] or 0
                parsed_pages = page_stats['parsed'] or 0
                
                # Get AdsManual entries for this portal - use aggregation for performance
                # Count complete ads by checking that all critical fields are populated
                text_fields = ['city', 'title']
                numeric_fields = ['price', 'square_footage', 'rooms']
                
                # Build Q object for complete ads (all critical fields have values)
                complete_condition = Q()
                
                # For text fields: not null and not empty string
                for field in text_fields:
                    complete_condition &= ~Q(**{field: None}) & ~Q(**{field: ''})
                
                # For numeric fields: not null and greater than 0
                for field in numeric_fields:
                    complete_condition &= ~Q(**{field: None}) & Q(**{f'{field}__gt': 0})
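                # The combined condition translates roughly to:
                #   NOT (city IS NULL) AND NOT (city = '') AND NOT (title IS NULL) AND NOT (title = '')
                #   AND NOT (price IS NULL) AND price > 0  (same pattern for square_footage and rooms)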
                
                ad_stats = AdsManual.objects.filter(
                    networkmonitoredpage__source=source
                ).aggregate(
                    total=Count('id', distinct=True),
                    complete=Count('id', filter=complete_condition, distinct=True)
                )
                
                total_ads = ad_stats['total'] or 0
                complete_ads = ad_stats['complete'] or 0
                
                # Performance = (complete ads / total monitored pages) * 100
                performance = (complete_ads / total_pages * 100) if total_pages > 0 else 0
                
                # Success rate = (parsed pages / total pages) * 100
                success_rate = (parsed_pages / total_pages * 100) if total_pages > 0 else 0
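                # Worked example (illustrative numbers): 45 complete ads over 60 monitored
                # pages gives performance = 75.0; 55 parsed pages over 60 gives success_rate ≈ 91.67.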
                
                # Calculate basic fields percentage (14 main fields)
                basic_fields = [
                    'price', 'square_footage', 'city', 'title', 'rooms',
                    'estate_type', 'offer_type', 'description', 'address',
                    'floor', 'floors_num', 'bathrooms', 'lat', 'lon'
                ]
                basic_complete_count = 0
                
                # distinct() avoids double-counting ads linked from several monitored pages
                for ad in AdsManual.objects.filter(networkmonitoredpage__source=source).distinct():
                    has_all_basic = True
                    for field in basic_fields:
                        value = getattr(ad, field, None)
                        if value in [None, '', 0, '0']:
                            has_all_basic = False
                            break
                    if has_all_basic:
                        basic_complete_count += 1
                
                basic_performance = (basic_complete_count / total_pages * 100) if total_pages > 0 else 0
                
                if performance >= 90:
                    status = 'excellent'
                elif performance >= 70:
                    status = 'good'
                elif performance >= 50:
                    status = 'fair'
                else:
                    status = 'poor'
                
                portal_data = {
                    'domain': domain,
                    'total_pages': total_pages,
                    'parsed_pages': parsed_pages,
                    'unparsed_pages': total_pages - parsed_pages,
                    'complete_ads': complete_ads,
                    'incomplete_ads': parsed_pages - complete_ads,
                    'performance_percentage': round(performance, 2),
                    'basic_fields_percentage': round(basic_performance, 2),
                    'success_rate': round(success_rate, 2),
                    'status': status
                }
                
                portal_list.append(portal_data)
                performances.append(performance)
            
            # Sort by total pages descending
            portal_list.sort(key=lambda x: x['total_pages'], reverse=True)
            
            avg_performance = sum(performances) / len(performances) if performances else 0
            best_portal = max(portal_list, key=lambda x: x['performance_percentage']) if portal_list else None
            worst_portal = min(portal_list, key=lambda x: x['performance_percentage']) if portal_list else None
            
            return {
                'portals': portal_list,
                'summary': {
                    'total_portals': len(portal_list),
                    'avg_performance': round(avg_performance, 2),
                    'best_portal': best_portal['domain'] if best_portal else None,
                    'worst_portal': worst_portal['domain'] if worst_portal else None
                },
                'timestamp': timezone.now().isoformat()
            }
        except Exception as e:
            return {
                'portals': [],
                'summary': {
                    'total_portals': 0,
                    'avg_performance': 0,
                    'best_portal': None,
                    'worst_portal': None
                },
                'error': str(e),
                'timestamp': timezone.now().isoformat()
            }
    
    def get_portal_details(self, domain):
        """Get detailed analysis for a specific portal."""
        try:
            # Find the source by domain - try multiple matching strategies
            
            # Try exact domain match in base_url
            source = SourceNetwork.objects.filter(base_url__icontains=domain).first()
            
            # If not found, try matching just the domain part
            if not source:
                for src in SourceNetwork.objects.filter(enabled=True):
                    src_domain = urlparse(src.base_url).netloc
                    if domain.lower() in src_domain.lower() or src_domain.lower() in domain.lower():
                        source = src
                        break
            
            if not source:
                return {
                    'domain': domain,
                    'error': f'Portal not found. Searched for: {domain}',
                    'timestamp': timezone.now().isoformat()
                }
            
            # Total monitored pages for this portal
            total_pages = NetworkMonitoredPage.objects.filter(source=source).count()
            
            # Pages successfully parsed (have AdsManual entry)
            parsed_pages = NetworkMonitoredPage.objects.filter(
                source=source,
                network_ad_manual__isnull=False
            ).count()
            
            unparsed_pages = total_pages - parsed_pages
            
            # Get all AdsManual entries for this portal
            ads = AdsManual.objects.filter(
                networkmonitoredpage__source=source
            ).distinct()
            
            # Define main/basic important selectors/fields to check
            basic_selector_fields = [
                'price', 'square_footage', 'city', 'title', 'rooms',
                'estate_type', 'offer_type', 'description', 'address',
                'floor', 'floors_num', 'bathrooms', 'lat', 'lon'
            ]
            
            # Get ALL fields from AdsManual model (excluding system fields)
            all_fields = []
            excluded_fields = ['id', 'created_at', 'updated_at', 'incomplete_data', 'network_ad_manual']
            for field in AdsManual._meta.get_fields():
                if field.name not in excluded_fields and not field.many_to_many and not field.one_to_many:
                    all_fields.append(field.name)
            
            # Evaluate the ad count once and reuse it for every field below
            total_ads = ads.count()
            
            # Analyze BASIC fields
            basic_field_stats = {}
            for field in basic_selector_fields:
                populated_count = 0
                for ad in ads:
                    value = getattr(ad, field, None)
                    if value not in [None, '', 0, '0', []]:
                        populated_count += 1
                
                success_rate = (populated_count / total_ads * 100) if total_ads > 0 else 0
                basic_field_stats[field] = {
                    'populated': populated_count,
                    'missing': total_ads - populated_count,
                    'success_rate': success_rate
                }
            
            # Analyze ALL fields
            all_field_stats = {}
            for field in all_fields:
                populated_count = 0
                for ad in ads:
                    try:
                        value = getattr(ad, field, None)
                        if value not in [None, '', 0, '0', []]:
                            populated_count += 1
                    except Exception:
                        continue
                
                success_rate = (populated_count / total_ads * 100) if total_ads > 0 else 0
                all_field_stats[field] = {
                    'populated': populated_count,
                    'missing': total_ads - populated_count,
                    'success_rate': success_rate
                }
            
            # Calculate BASIC selector health
            basic_problematic_selectors = []
            basic_good_selectors = 0
            basic_bad_selectors = 0
            
            for field, stats in basic_field_stats.items():
                if stats['success_rate'] < 50:
                    basic_problematic_selectors.append(field)
                    basic_bad_selectors += 1
                else:
                    basic_good_selectors += 1
            
            total_basic_selectors = len(basic_selector_fields)
            basic_selector_health = (basic_good_selectors / total_basic_selectors * 100) if total_basic_selectors > 0 else 0
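            # Worked example (illustrative): 11 of the 14 basic selectors reaching the 50%
            # threshold gives basic_selector_health = 11 / 14 * 100 ≈ 78.57.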
            
            # Calculate OVERALL selector health (all fields)
            all_problematic_selectors = []
            all_good_selectors = 0
            all_bad_selectors = 0
            
            for field, stats in all_field_stats.items():
                if stats['success_rate'] < 50:
                    all_problematic_selectors.append(field)
                    all_bad_selectors += 1
                else:
                    all_good_selectors += 1
            
            total_all_selectors = len(all_fields)
            overall_selector_health = (all_good_selectors / total_all_selectors * 100) if total_all_selectors > 0 else 0
            
            # Count complete ads (all critical fields populated)
            critical_fields = ['price', 'square_footage', 'city', 'title']
            complete_ads = 0
            incomplete_ads = 0
            
            for ad in ads:
                is_complete = all([
                    getattr(ad, field) not in [None, '', 0, '0']
                    for field in critical_fields
                ])
                if is_complete:
                    complete_ads += 1
                else:
                    incomplete_ads += 1
            
            # Overall performance
            performance = (complete_ads / total_pages * 100) if total_pages > 0 else 0
            success_rate = (parsed_pages / total_pages * 100) if total_pages > 0 else 0
            
            # Error breakdown
            by_type = []
            if unparsed_pages > 0:
                by_type.append({'error_type': 'Not Parsed', 'count': unparsed_pages})
            if incomplete_ads > 0:
                by_type.append({'error_type': 'Incomplete Data', 'count': incomplete_ads})
            
            # Recent errors
            recent = []
            recent_incomplete = ads.filter(incomplete_data=True).order_by('-created_at')[:5]
            for ad in recent_incomplete:
                missing = []
                for field in critical_fields:
                    if getattr(ad, field) in [None, '', 0, '0']:
                        missing.append(field)
                
                recent.append({
                    'page_id': ad.id,
                    'url': ad.url[:100],
                    'error': f'Missing fields: {", ".join(missing)}',
                    'error_type': 'Incomplete Data',
                    'timestamp': ad.created_at.isoformat() if ad.created_at else None
                })
            
            # ===== DESCRIPTION SCRAPER ANALYSIS (ONLY ADS WHERE IT FOUND VARIABLES) =====
            # Only analyze impact on ads where description scraper actually found at least 1 variable
            ads_with_description_scraping = ads.exclude(description_scraped_variables__isnull=True).exclude(description_scraped_variables='')
            total_with_desc_scraper = ads_with_description_scraping.count()
            
            # Count fields enriched by description scraper (only for ads with variables found)
            desc_enriched_fields = {}
            total_enrichments = 0
            ads_with_findings = 0  # Count of ads where desc scraper found at least 1 field
            
            for ad in ads_with_description_scraping:
                if ad.description_scraped_variables:
                    # Parse comma-separated field names
                    enriched = [f.strip() for f in ad.description_scraped_variables.split(',') if f.strip()]
                    if len(enriched) > 0:  # Only count if it found at least 1 field
                        ads_with_findings += 1
                        total_enrichments += len(enriched)
                        for field_name in enriched:
                            desc_enriched_fields[field_name] = desc_enriched_fields.get(field_name, 0) + 1
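            # Example (illustrative value): description_scraped_variables = "rooms, floor, bathrooms"
            # parses to ['rooms', 'floor', 'bathrooms'], i.e. 3 enrichments for that ad.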
            
            # Calculate performance WITHOUT description scraper (ONLY for ads where it found variables)
            # (count ads that would be complete using only manual selectors)
            manual_only_complete_in_affected = 0
            description_boost_count = 0  # Ads made complete (critical fields filled)
            total_ads_with_any_enrichment = 0  # Total ads that got ANY field enriched
            
            for ad in ads_with_description_scraping:
                if not ad.description_scraped_variables:
                    continue
                    
                enriched = [f.strip() for f in ad.description_scraped_variables.split(',') if f.strip()]
                if len(enriched) == 0:  # Skip ads where desc scraper found nothing
                    continue
                
                # Count this ad as enriched (it got at least 1 field)
                total_ads_with_any_enrichment += 1
                
                # Check if ad has all critical fields from manual parser only
                manual_complete = True
                desc_scraped_criticals = enriched
                
                for field in critical_fields:
                    field_value = getattr(ad, field, None)
                    has_value = field_value not in [None, '', 0, '0']
                    
                    # If field has value but was scraped from description, it's not from manual parser
                    if has_value and field in desc_scraped_criticals:
                        manual_complete = False
                        break
                    elif not has_value:
                        manual_complete = False
                        break
                
                if manual_complete:
                    manual_only_complete_in_affected += 1
                elif all([getattr(ad, field) not in [None, '', 0, '0'] for field in critical_fields]):
                    # Ad is complete now but wasn't complete with manual only
                    description_boost_count += 1
            
            # Calculate boost percentage ONLY for ads where desc scraper found variables
            manual_only_performance_affected = (manual_only_complete_in_affected / ads_with_findings * 100) if ads_with_findings > 0 else 0
            complete_in_affected = manual_only_complete_in_affected + description_boost_count
            complete_performance_affected = (complete_in_affected / ads_with_findings * 100) if ads_with_findings > 0 else 0
            desc_boost_percentage = complete_performance_affected - manual_only_performance_affected
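            # Worked example (illustrative): with 80 ads where the scraper found variables,
            # 20 complete from manual selectors alone (25.0%) and 50 complete overall (62.5%)
            # gives desc_boost_percentage = 62.5 - 25.0 = 37.5 points.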
            
            # Calculate average enrichments per ad (only ads where it found something)
            avg_enrichments = (total_enrichments / ads_with_findings) if ads_with_findings > 0 else 0
            
            # Sort enriched fields by frequency
            top_enriched_fields = sorted(desc_enriched_fields.items(), key=lambda x: x[1], reverse=True)[:10]
            
            # Recent activity
            now = timezone.now()
            one_hour_ago = now - timedelta(hours=1)
            one_day_ago = now - timedelta(days=1)
            two_days_ago = now - timedelta(days=2)
            
            pages_last_hour = ads.filter(created_at__gte=one_hour_ago).count()
            pages_last_24h = ads.filter(created_at__gte=one_day_ago).count()
            pages_prev_24h = ads.filter(
                created_at__gte=two_days_ago,
                created_at__lt=one_day_ago
            ).count()
            
            if pages_last_24h > pages_prev_24h * 1.1:
                trend = 'increasing'
            elif pages_last_24h < pages_prev_24h * 0.9:
                trend = 'decreasing'
            else:
                trend = 'stable'
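            # Example (illustrative counts): 110 pages in the last 24h vs 90 in the previous
            # 24h exceeds 90 * 1.1 = 99, so the trend is reported as 'increasing'.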
            
            return {
                'domain': domain,
                'performance_percentage': round(performance, 2),
                'manual_only_performance': round(manual_only_performance_affected, 2),
                'with_desc_scraper_performance': round(complete_performance_affected, 2),
                'description_boost': round(desc_boost_percentage, 2),
                'ads_with_desc_findings': ads_with_findings,
                'total_ads_checked': total_pages,
                'basic_selector_analysis': {
                    'good_selectors': basic_good_selectors,
                    'bad_selectors': basic_bad_selectors,
                    'total_selectors': total_basic_selectors,
                    'selector_health_percentage': round(basic_selector_health, 2),
                    'problematic_selectors': basic_problematic_selectors[:10],
                    'field_details': basic_field_stats
                },
                'overall_selector_analysis': {
                    'good_selectors': all_good_selectors,
                    'bad_selectors': all_bad_selectors,
                    'total_selectors': total_all_selectors,
                    'selector_health_percentage': round(overall_selector_health, 2),
                    'problematic_selectors': all_problematic_selectors[:20],
                    'field_details': all_field_stats
                },
                'page_analysis': {
                    'total_pages': total_pages,
                    'parsed_pages': parsed_pages,
                    'unparsed_pages': unparsed_pages,
                    'complete_ads': complete_ads,
                    'incomplete_ads': incomplete_ads,
                    'success_rate': round(success_rate, 2)
                },
                'description_scraper_analysis': {
                    'total_ads_with_description_scraping': total_with_desc_scraper,
                    'total_ads_enriched': total_ads_with_any_enrichment,  # Total ads that got ANY field
                    'ads_made_complete': description_boost_count,  # Ads that went from incomplete to complete
                    'manual_only_complete_ads': manual_only_complete_in_affected,
                    'total_enrichments': total_enrichments,
                    'avg_enrichments_per_ad': round(avg_enrichments, 2),
                    'top_enriched_fields': [{'field': field, 'count': count} for field, count in top_enriched_fields],
                    'usage_percentage': round((total_with_desc_scraper / total_ads * 100) if total_ads > 0 else 0, 2)
                },
                'error_breakdown': {
                    'by_type': by_type,
                    'recent_errors': recent
                },
                'recent_activity': {
                    'pages_last_hour': pages_last_hour,
                    'pages_last_24h': pages_last_24h,
                    'pages_prev_24h': pages_prev_24h,
                    'trend': trend
                },
                'timestamp': timezone.now().isoformat()
            }
        except Exception as e:
            return {
                'domain': domain,
                'error': str(e),
                'timestamp': timezone.now().isoformat()
            }
    
    def get_live_feed(self, limit=50):
        """Get real-time feed of recently processed ads."""
        try:
            # Critical fields to check
            critical_fields = ['price', 'square_footage', 'city', 'title', 'rooms']
            
            # Get recent ads ordered by creation time with only necessary fields
            recent_ads = AdsManual.objects.only(
                'id', 'url', 'created_at', 'price', 'square_footage', 'city', 'title', 'rooms',
                'estate_type', 'offer_type', 'description', 'address', 'floor', 'floors_num',
                'bathrooms', 'lat', 'lon', 'description_scraped_variables'
            ).order_by('-created_at')[:limit]
            
            ads_list = []
            completion_rates = []
            portals_active = set()
            
            # Calculate processing rate (ads processed in last minute)
            one_minute_ago = timezone.now() - timedelta(minutes=1)
            ads_last_minute = AdsManual.objects.filter(created_at__gte=one_minute_ago).count()
            
            for ad in recent_ads:
                # Count extracted fields
                fields_extracted = 0
                fields_total = 13  # Number of checks in field_checks below (lat/lon counted as one)
                
                # Check each field
                field_checks = {
                    'price': bool(ad.price),
                    'square_footage': bool(ad.square_footage),
                    'city': bool(ad.city),
                    'title': bool(ad.title),
                    'rooms': bool(ad.rooms),
                    'estate_type': bool(ad.estate_type),
                    'offer_type': bool(ad.offer_type),
                    'description': bool(ad.description),
                    'address': bool(ad.address),
                    'floor': ad.floor is not None,
                    'floors_num': ad.floors_num is not None,
                    'bathrooms': ad.bathrooms is not None,
                    'lat_lon': bool(ad.lat and ad.lon)
                }
                
                fields_extracted = sum(1 for v in field_checks.values() if v)
                completion_rate = (fields_extracted / fields_total) * 100
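                # e.g. (illustrative) 10 of the 13 checks present -> completion_rate ≈ 76.9 after rounding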
                completion_rates.append(completion_rate)
                
                # Check critical fields
                critical_present = all([
                    field_checks.get(field, False) for field in critical_fields
                ])
                
                # Check if description scraper was used
                has_description_scraper = bool(ad.description_scraped_variables)
                
                # Get portal name through the reverse relationship
                # NetworkMonitoredPage has network_ad_manual pointing to AdsManual
                try:
                    monitored_page = NetworkMonitoredPage.objects.filter(network_ad_manual=ad).first()
                    if monitored_page and monitored_page.source:
                        portal = monitored_page.source.name
                    else:
                        portal = monitored_page.name if monitored_page else 'Unknown'
                except Exception:
                    portal = 'Unknown'
                    
                portals_active.add(portal)
                
                # Calculate time since processing (approximate)
                seconds_ago = (timezone.now() - ad.created_at).total_seconds()
                
                ads_list.append({
                    'id': ad.id,
                    'url': ad.url,
                    'portal': portal,
                    'processed_at': ad.created_at.isoformat(),
                    'seconds_ago': int(seconds_ago),
                    'fields_extracted': fields_extracted,
                    'fields_total': fields_total,
                    'completion_rate': round(completion_rate, 1),
                    'has_description_scraper': has_description_scraper,
                    'critical_fields_present': critical_present,
                    'status': 'complete' if critical_present else ('good' if completion_rate >= 70 else 'incomplete'),
                    'enriched_fields': [f.strip() for f in ad.description_scraped_variables.split(',') if f.strip()] if ad.description_scraped_variables else []
                })
            
            # Calculate summary metrics
            avg_completion = sum(completion_rates) / len(completion_rates) if completion_rates else 0
            
            return {
                'ads': ads_list,
                'summary': {
                    'total_processed': len(ads_list),
                    'avg_completion_rate': round(avg_completion, 1),
                    'processing_rate_per_minute': ads_last_minute,
                    'active_portals': len(portals_active),
                    'portals_list': list(portals_active)
                },
                'timestamp': timezone.now().isoformat()
            }
        except Exception as e:
            return {
                'ads': [],
                'summary': {
                    'total_processed': 0,
                    'avg_completion_rate': 0,
                    'processing_rate_per_minute': 0,
                    'active_portals': 0,
                    'portals_list': []
                },
                'error': str(e),
                'timestamp': timezone.now().isoformat()
            }
