"""
Polish Text Normalizer
Utilities for normalizing Polish text, handling diacritics, and common patterns.
"""

import re
from typing import Any, Dict, Optional


class PolishNormalizer:
    """
    Normalizes Polish text including diacritics, currency, and common abbreviations.

    Every public method is a classmethod and reads the lookup tables below
    through ``cls`` at call time, so a subclass can extend or override a table
    without re-implementing the methods.
    """

    # Polish diacritics mapping (single character -> ASCII replacement)
    DIACRITICS_MAP = {
        'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n',
        'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z',
        'Ą': 'A', 'Ć': 'C', 'Ę': 'E', 'Ł': 'L', 'Ń': 'N',
        'Ó': 'O', 'Ś': 'S', 'Ź': 'Z', 'Ż': 'Z',
    }

    # Common Polish abbreviations (matched case-insensitively)
    ABBREVIATIONS = {
        'ul.': 'ulica',
        'al.': 'aleja',
        'pl.': 'plac',
        'os.': 'osiedle',
        'm.': 'mieszkanie',
        'pok.': 'pokoje',
        'pow.': 'powierzchnia',
        'woj.': 'województwo',
    }

    # Currency patterns (symbol or spelling -> ISO 4217 code)
    CURRENCY_PATTERNS = {
        'zł': 'PLN',
        'zl': 'PLN',
        'PLN': 'PLN',
        '€': 'EUR',
        'EUR': 'EUR',
        '$': 'USD',
        'USD': 'USD',
    }

    @classmethod
    def remove_diacritics(cls, text: str) -> str:
        """
        Remove Polish diacritics from text.

        Args:
            text: Text with diacritics

        Returns:
            Text with every character found in DIACRITICS_MAP replaced by its
            mapped value; all other characters are kept. Empty or None input
            yields "".
        """
        if not text:
            return ""

        mapping = cls.DIACRITICS_MAP
        return ''.join(mapping.get(char, char) for char in text)

    @classmethod
    def normalize_abbreviations(cls, text: str) -> str:
        """
        Expand common Polish abbreviations.

        An abbreviation is recognised only when it is not preceded by a word
        character (so "ul." inside "mul." is left alone). Matching is
        case-insensitive and the expansion is always inserted as spelled in
        ABBREVIATIONS. When the abbreviation is glued to the following word
        ("ul.Długa") a space is inserted after the expansion.

        Args:
            text: Text with abbreviations

        Returns:
            Text with expanded abbreviations ("" for empty or None input)
        """
        if not text:
            return ""

        expansions = {abbr.lower(): full for abbr, full in cls.ABBREVIATIONS.items()}
        if not expansions:
            return text

        # Longest first so a longer abbreviation wins over one that is its prefix.
        alternation = '|'.join(
            re.escape(abbr) for abbr in sorted(expansions, key=len, reverse=True)
        )
        # NOTE: a trailing r'\b' cannot be used here. Every abbreviation ends
        # with '.', and there is no word boundary between '.' and a following
        # space, so "ul. Długa" would never match. The lookahead only records
        # whether a word character follows directly.
        pattern = r'(?<!\w)(' + alternation + r')(?=(\w?))'

        def expand(match):
            full = expansions[match.group(1).lower()]
            # Keep the expansion separated from a word it was glued to.
            return full + ' ' if match.group(2) else full

        return re.sub(pattern, expand, text, flags=re.IGNORECASE)

    @classmethod
    def normalize_currency(cls, text: str) -> str:
        """
        Normalize currency symbols to standard codes.

        Alphabetic spellings (zł, zl, PLN, EUR, USD) are replaced
        case-insensitively, and only when they stand alone: not preceded by a
        letter and not followed by a word character. This keeps words such as
        "złoty" intact while still handling "100zł". Non-alphabetic symbols
        (€, $) are replaced wherever they occur.

        Args:
            text: Text with currency symbols

        Returns:
            Text with normalized currency codes ("" for empty or None input)
        """
        if not text:
            return ""

        result = text
        for symbol, code in cls.CURRENCY_PATTERNS.items():
            if symbol.isalpha():
                # (?<![^\W\d_]) == "not preceded by a letter"; digits are
                # allowed in front so that "100zł" is still recognised.
                pattern = r'(?<![^\W\d_])' + re.escape(symbol) + r'(?!\w)'
                result = re.sub(
                    pattern, lambda _match, code=code: code, result,
                    flags=re.IGNORECASE,
                )
            else:
                result = result.replace(symbol, code)

        return result

    @staticmethod
    def _parse_amount(raw: str) -> Optional[float]:
        """
        Convert a Polish-formatted number string to a float.

        Whitespace is treated as digit grouping. When both '.' and ',' occur,
        the one appearing last is the decimal separator. A lone ',' is a
        decimal separator; several ',' are grouping. A '.' is grouping when it
        occurs more than once or is followed by exactly three digits
        ("450.000"), otherwise it is a decimal separator ("99.99").

        Args:
            raw: Digits with optional whitespace, '.' and ',' separators

        Returns:
            Parsed value, or None if the string is not a well-formed number
        """
        compact = re.sub(r'\s+', '', raw)
        dot_at = compact.rfind('.')
        comma_at = compact.rfind(',')

        if dot_at != -1 and comma_at != -1:
            decimal_at = max(dot_at, comma_at)
            # A decimal separator can occur only once ("1.500,00,5" is malformed).
            if compact.count(compact[decimal_at]) > 1:
                return None
        elif comma_at != -1:
            decimal_at = comma_at if compact.count(',') == 1 else -1
        elif dot_at != -1:
            grouping = compact.count('.') > 1 or len(compact) - dot_at - 1 == 3
            decimal_at = -1 if grouping else dot_at
        else:
            decimal_at = -1

        if decimal_at == -1:
            whole, fraction = compact, ''
        else:
            whole, fraction = compact[:decimal_at], compact[decimal_at + 1:]
        whole = whole.replace('.', '').replace(',', '')

        try:
            return float(whole + '.' + fraction) if fraction else float(whole)
        except ValueError:
            return None

    @classmethod
    def extract_price(cls, text: str) -> Optional[Dict[str, Any]]:
        """
        Extract price and currency from text.

        Only the form "number followed by currency" is recognised (for example
        "1 500 000 zł" or "250 EUR"); the first such occurrence wins. The
        currency is matched case-insensitively against the keys of
        CURRENCY_PATTERNS.

        Args:
            text: Text containing price information

        Returns:
            Dict with 'amount' (float) and 'currency' (code from
            CURRENCY_PATTERNS), or None when no price is found or the number
            cannot be parsed
        """
        if not text:
            return None

        # Lower-cased keys so the lookup agrees with the case-insensitive match.
        codes = {symbol.lower(): code for symbol, code in cls.CURRENCY_PATTERNS.items()}
        if not codes:
            return None

        alternation = '|'.join(
            re.escape(symbol) for symbol in sorted(codes, key=len, reverse=True)
        )
        # Pattern: number (digit groups split by whitespace, '.' or ',')
        # followed by a currency symbol.
        pattern = r'(\d+(?:[\s.,]\d+)*)\s*(' + alternation + r')'
        match = re.search(pattern, text, re.IGNORECASE)
        if match is None:
            return None

        amount = cls._parse_amount(match.group(1))
        if amount is None:
            return None

        return {
            'amount': amount,
            'currency': codes.get(match.group(2).lower(), 'PLN'),
        }

    @classmethod
    def normalize_text(cls, text: str,
                      remove_diacritics: bool = False,
                      expand_abbreviations: bool = True,
                      normalize_currency: bool = True) -> str:
        """
        Apply all normalization steps to text.

        Steps run in a fixed order: abbreviations, currency, diacritics, and
        finally whitespace collapsing (always applied).

        Args:
            text: Input text
            remove_diacritics: Whether to remove diacritics
            expand_abbreviations: Whether to expand abbreviations
            normalize_currency: Whether to normalize currency symbols

        Returns:
            Normalized text ("" for empty or None input)
        """
        if not text:
            return ""

        result = text

        if expand_abbreviations:
            result = cls.normalize_abbreviations(result)

        if normalize_currency:
            result = cls.normalize_currency(result)

        # Diacritics go last so the currency step still sees "zł".
        if remove_diacritics:
            result = cls.remove_diacritics(result)

        # Clean up excessive whitespace
        return re.sub(r'\s+', ' ', result).strip()

    @classmethod
    def normalize_address(cls, address: str) -> str:
        """
        Normalize Polish address format.

        Expands abbreviations, puts exactly one space after each comma (and
        none before it), and collapses runs of whitespace.

        Args:
            address: Raw address string

        Returns:
            Normalized address ("" for empty or None input)
        """
        if not address:
            return ""

        result = cls.normalize_abbreviations(address)

        # Standardize spacing around commas
        result = re.sub(r'\s*,\s*', ', ', result)

        # Clean up whitespace
        return re.sub(r'\s+', ' ', result).strip()

    @classmethod
    def extract_area(cls, text: str) -> Optional[float]:
        """
        Extract area (square meters) from text.

        Args:
            text: Text containing area information

        Returns:
            Area in square meters taken from the first "<number> m²/m2/mkw/
            metrów/metrow" occurrence (case-insensitive), or None
        """
        if not text:
            return None

        # Pattern: number (optional ',' or '.' decimals) followed by a unit.
        pattern = r'(\d+(?:[.,]\d+)?)\s*(?:m²|m2|mkw|metr[óo]w)'
        match = re.search(pattern, text, re.IGNORECASE)
        if match is None:
            return None

        # The capture is digits with at most one separator, so float() cannot fail.
        return float(match.group(1).replace(',', '.'))

    @classmethod
    def extract_rooms(cls, text: str) -> Optional[int]:
        """
        Extract number of rooms from text.

        Args:
            text: Text containing room information

        Returns:
            Number of rooms taken from the first "<digits> pok./poko.../room"
            occurrence (case-insensitive), or None
        """
        if not text:
            return None

        # Pattern: number followed by room indicators
        pattern = r'(\d+)\s*(?:pok[o.]|pokoi|pokoje|room)'
        match = re.search(pattern, text, re.IGNORECASE)
        if match is None:
            return None

        # The capture is digits only, so int() cannot fail.
        return int(match.group(1))
