
    wi                     4    d Z ddlZddlmZmZ  G d d      Zy)zi
Polish Text Normalizer
Utilities for normalizing Polish text, handling diacritics, and common patterns.
    N)DictOptionalc                      e Zd ZdZi dddddddd	d
ddddddddddddddddddddddd d!d"d#d"iZd$d%d&d'd(d)d*d+d,Zd-d-d-d.d.d/d/d0Zed1ed2efd3       Z	ed1ed2efd4       Z
ed1ed2efd5       Zed1ed2eeeef      fd6       Ze	 	 	 d@d1ed7ed8ed9ed2ef
d:       Zed;ed2efd<       Zed1ed2ee   fd=       Zed1ed2ee   fd>       Zy?)APolishNormalizerzZ
    Normalizes Polish text including diacritics, currency, and common abbreviations.
    u   ąau   ćcu   ęeu   łlu   ńn   óou   śsu   źzu   żu   ĄAu   ĆCu   ĘEu   ŁLu   ŃN   ÓOu   ŚSu   ŹZu   Żulicaalejaplacosiedle
mieszkaniepokojepowierzchniau   województwo)zul.zal.zpl.zos.zm.zpok.zpow.zwoj.PLNEURUSD)u   złzlr    u   €r!   $r"   textreturnc                     |syg }|D ]-  }|j                  | j                  j                  ||             / dj                  |      S )z
        Remove Polish diacritics from text.
        
        Args:
            text: Text with diacritics
            
        Returns:
            Text without diacritics
         )appendDIACRITICS_MAPgetjoin)clsr%   resultchars       S/var/www/extractly/manual_agregator/description_extrator/utils/polish_normalizer.pyremove_diacriticsz"PolishNormalizer.remove_diacritics.   sH     DMM#,,00t<=  wwv    c                     |sy|}| j                   j                         D ]E  \  }}t        j                  dt        j                  |      z   dz   ||t        j
                        }G |S )z
        Expand common Polish abbreviations.
        
        Args:
            text: Text with abbreviations
            
        Returns:
            Text with expanded abbreviations
        r(   z\b)flags)ABBREVIATIONSitemsresubescape
IGNORECASE)r-   r%   r.   abbrfulls        r0   normalize_abbreviationsz(PolishNormalizer.normalize_abbreviationsB   s`     ++113JD$VVEBIIdO3e;T6QSQ^Q^_F 4 r2   c                 x    |sy|}| j                   j                         D ]  \  }}|j                  ||      } |S )z
        Normalize currency symbols to standard codes.
        
        Args:
            text: Text with currency symbols
            
        Returns:
            Text with normalized currency codes
        r(   )CURRENCY_PATTERNSr6   replace)r-   r%   r.   symbolcodes        r0   normalize_currencyz#PolishNormalizer.normalize_currencyV   sC     11779LFD^^FD1F : r2   c                 X   |syd}t        j                  ||t         j                        }|ro|j                  d      j	                  dd      j	                  dd      }|j                  d      }| j
                  j                  |d	      }	 t        |      }||d
S y# t        $ r Y yw xY w)z
        Extract price and currency from text.
        
        Args:
            text: Text containing price information
            
        Returns:
            Dict with 'amount' and 'currency' or None
        Nu.   (\d+(?:[\s.,]\d+)*)\s*(zł|PLN|EUR|€|USD|\$)    r(   ,.   r    )amountcurrency)	r7   searchr:   groupr@   r?   r+   float
ValueError)r-   r%   patternmatch
amount_strcurrency_rawrK   rJ   s           r0   extract_pricezPolishNormalizer.extract_pricej   s      D		'47Q//R8@@cJJ ;;q>L ,,00uEHz*$ (    s   B 	B)(B)r1   expand_abbreviationsrC   c                     |sy|}|r| j                  |      }|r| j                  |      }|r| j                  |      }t        j                  dd|      j                         }|S )aj  
        Apply all normalization steps to text.
        
        Args:
            text: Input text
            remove_diacritics: Whether to remove diacritics
            expand_abbreviations: Whether to expand abbreviations
            normalize_currency: Whether to normalize currency symbols
            
        Returns:
            Normalized text
        r(   \s+rF   )r=   rC   r1   r7   r8   strip)r-   r%   r1   rU   rC   r.   s         r0   normalize_textzPolishNormalizer.normalize_text   sj    " 008F++F3F**62F V,224r2   addressc                     |sy| j                  |      }t        j                  dd|      }t        j                  dd|      j                         }|S )z
        Normalize Polish address format.
        
        Args:
            address: Raw address string
            
        Returns:
            Normalized address
        r(   z\s*,\s*z, rW   rF   )r=   r7   r8   rX   )r-   rZ   r.   s      r0   normalize_addressz"PolishNormalizer.normalize_address   sR      ,,W5 
D&1 V,224r2   c                     |syd}t        j                  ||t         j                        }|r-|j                  d      j	                  dd      }	 t        |      S y# t        $ r Y yw xY w)z
        Extract area (square meters) from text.
        
        Args:
            text: Text containing area information
            
        Returns:
            Area in square meters or None
        Nu-   (\d+(?:[.,]\d+)?)\s*(?:m²|m2|mkw|metr[óo]w)rE   rG   rH   )r7   rL   r:   rM   r@   rN   rO   )r-   r%   rP   rQ   area_strs        r0   extract_areazPolishNormalizer.extract_area   sm      C		'47{{1~--c37HX&   s   
A 	A'&A'c                     |syd}t        j                  ||t         j                        }|r	 t        |j	                  d            S y# t
        $ r Y yw xY w)z
        Extract number of rooms from text.
        
        Args:
            text: Text containing room information
            
        Returns:
            Number of rooms or None
        Nz%(\d+)\s*(?:pok[o.]|pokoi|pokoje|room)rE   )r7   rL   r:   intrM   rO   )r-   r%   rP   rQ   s       r0   extract_roomszPolishNormalizer.extract_rooms   s[      ;		'475;;q>**   s   A	 		AAN)FTT)__name__
__module____qualname____doc__r*   r5   r?   classmethodstrr1   r=   rC   r   r   anyrT   boolrY   r\   rN   r_   ra   rb    r2   r0   r   r   
   sa   
c"C)-s48#c"C)-s 	c  #C *.s 59# 	c	 	 #C	 *.s	N 	M  S S  & 3 3  & c c  & ! !$sCx.)A ! !F 053715!# !)-!,0! +/! ;>! !F    0    4  #  r2   r   )rf   r7   typingr   r   r   rk   r2   r0   <module>rm      s   
 
 !t tr2   