
    :h >                       d Z ddlmZ ddlZddlZddlZddlZddlmZm	Z	 ddl
mZmZmZmZ ddlmZmZ  ej$                  ej&                          ej(                  e      Zh dZh d	Z ej2                  d
ej4                        Zh dZh dZddhZ ej2                  dej4                        ZddZ ddZ!ddZ"ddZ#ddZ$e G d d             Z%ddZ&y)u  
dynamic_parser.py
=================
Uniwersalny parser ogłoszeń HTML + wrapper `raw_data_cleaner()`.

Zwraca słownik:
    raw_text      – złączony tekst wszystkich sekcji
    image_links   – lista src grafik
    parse_data    – {
        description    … str  albo list[str]  (patrz sekcja „Opis”)
        status         … available / unavailable / reserved / None
        available_from … "YYYY-MM-DD" lub None
        <key:value>    … reszta cech
      }
    )annotationsN)	dataclassfield)DictListSequenceTuple)BeautifulSoupTag)level>   dummy-imagecookie-bannertracking-pixelplaceholder.jpggfxsvgcontact>      poleć   wróć   więcej   wysłana   zadzwoń   zdjęcia   udostępnij   zobacz więcej   pokaż na mapie   skontaktuj się   charakter poglądowy   wiadomość została   wysyłanie wiadomości   wróć udostępnij zapisz   wyślij kolejną wiadomość   powyższa oferta ma charakter   zgłoś błąd lub naruszenie2   wystąpił błąd w trakcie wysyłania wiadomoście-mailoferta nie stanowidodaj do ulubionychdodano do ulubionych cele przetwarzania i twoje prawa%administratorem danych osobowych jestmapaemailmniejdrukujzapiszreklamaobserwujz
^\s*opis\b>   	   dostępna	   dostępne	   dostępny>      zajęte	   wynajęte   niedostępne	sprzedanezarezerwowane
rezerwacjaz>(?:od|od dnia)\s*(\d{4}-\d{2}-\d{2}|\d{2}[./-]\d{2}[./-]\d{4})c                    	 t        j                  |       } | j                         j                  d      j                  d      j	                  dd      j                  d      S # t        $ r Y Xw xY w)N"'\ /)jsonloads	Exceptionstripreplacerstrip)links    9/var/www/extractly/html_agregator/utils/dynamic_parser.py
_clean_urlrJ   G   se    zz$ ::<c"((-55dB?FFsKK  s   A$ $	A0/A0c                L    t        j                  dd| j                               S )Nz\s{2,} )resubrE   txts    rI   _normalize_spacerQ   O   s    66)S#))+..    c                \    t        t        j                  d| j                                     S )Nu<   ^[A-ZĄĆĘŁŃÓŚŹŻa-ząćęłńóśźż0-9\s\-/]{2,30}$)boolrM   matchrE   rO   s    rI   _is_keyrV   S   s#    XZ]ZcZcZefggrR   c                    t        j                  d|       }|rJt        |j                  d            r0|j                  d      |j                  d      j	                         fS d S )NuN   ^(?P<k>[A-Za-zĄĆĘŁŃÓŚŹŻąćęłńóśźż][^:]{1,30})[\s:]+(?P<v>.+)$kv)rM   rU   rV   grouprE   )cellms     rI   _maybe_split_kvr]   W   sS    
Y[_	A 459NAGGCL!''#,,,./XTXXrR   c                z    t               g }}| D ])  }||vs|j                  |       |j                  |       + |S N)setaddappend)seqseenoutxs       rI   _uniquerg   ^   s;    r#DD=HHQKJJqM  JrR   c                  f   e Zd ZU ded<    ed      Zded<    ede      Zded	<    ede      Zded
<    ede      Z	ded<    ede
      Zded<    edd      Zded<    edd      Zded<    edd      Zded<   d Zd$dZd$dZd Zd%dZd%dZd%dZd Zd Zd Zd  Zd! Zd" Zd&d#Zy)'UniversalListingParserstrhtmlF)initr
   soup)rl   default_factory	List[str]image_links	raw_lineszList[Dict[str, Sequence[str]]]raw_sectionsDict[str, str]	key_valueN)rl   defaultzstr | List[str] | Nonedescriptionz
str | Nonestatusavailable_fromc                n   t        | j                  d      | _        | j                         | _        | j                         | _        | j                          | j                          | j                          | j                          | j                          | j                          | j                          y )Nzhtml.parser)r
   rk   rm   _collect_imagesrp   _collect_linesrq   _parse_structures_parse_free_text_fix_shifted_pairs_prune_garbage_keys_merge_sections_extract_description_detect_availability)selfs    rI   __post_init__z$UniversalListingParser.__post_init__|   s    !$))];	!113!002 !  "!!#!!#rR   c                   | j                   j                  d      D cg c]-  }|j                  d      rt        |j                  d            / }}|D cg c]  t	        fdt
        D              s c}S c c}w c c}w )Nimgsrcc              3  B   K   | ]  }|j                         v   y wr_   lower).0bls     rI   	<genexpr>z9UniversalListingParser._collect_images.<locals>.<genexpr>   s     D/C!qAGGI~/C   )rm   find_allgetrJ   anyIMAGE_LINK_BLOCKLIST)r   r   linksr   s      `rI   rz   z&UniversalListingParser._collect_images   s     yy))%0
0wwu~ swwu~&0 	 
 
!D/CDD u
 	



s   2A<!Bc                    | j                   j                  dd      j                  d      }|D cg c]'  r#t        fdt        D              st              ) c}S c c}w )N
TrE   c              3  B   K   | ]  }|j                         v   y wr_   r   )r   swr   s     rI   r   z8UniversalListingParser._collect_lines.<locals>.<genexpr>   s     J7IR1779_7Ir   )rm   get_textsplitr   RAW_TEXT_STOPWORDSrQ   )r   linesr   s     `rI   r{   z%UniversalListingParser._collect_lines   sd    		""4t"4::4@ 
J7IJJ Q
 	
 
s   ,A"c                   | j                   | j                  | j                  fD ]#  }	 | j                  j	                   |              % y # t
        $ r+}t        j                  d|j                  |       Y d }~Vd }~ww xY w)Nz%s failed: %s)	_parse_tables	_parse_dl_parse_divsrt   updaterD   loggerwarning__name__)r   fnexcs      rI   r|   z(UniversalListingParser._parse_structures   si    %%t~~t7G7GHBB%%bd+ I  BSAABs    A	A?!A::A?c           	        i d }}| j                   j                  d      D ]=  }|j                  d      D ]%  }|j                  ddg      D cg c]&  }|j                  d      r|j                  d      ( }}|sIt        |      dk(  rt	        |d         r|d	   ||d   <   d }st        |      d	k(  rAt        |d         }t        |      }|r|d	   ||d   <   d }t	        |      r|}|r|||<   d }t        |      d
k\  st        |d	d        dz  dk(  s|d   |d	d  }
}	t        |
d d d   |
d	d d         D ]  \  }}t	        |      s|||	 d| <    ( @ |S c c}w )NtabletrtdthTr      r         z - )rm   r   r   lenrV   rQ   r]   zip)r   re   orphanr   rowccellsr[   mergedtitlerestrX   rY   s                rI   r   z$UniversalListingParser._parse_tables   s   $VYY''0E~~d+ !\\4,77zzz- JJTJ*7  
  u:?wuQx'8$)!HCaM!F u:?+E!H5D,T2F)/F1I!% t}!% &*F!%  u:?s59~'9Q'>"'(E!"I4E #D1ItADqDz :1"1:45C5'QC 01 !;I , 1P 
Ms   +E#c                ^   i }| j                   j                  d      D ]  }t        |j                  d      |j                  d            D ]]  \  }}t        |j	                  dd      j                  d            }t        |j	                  dd            }t        |      sY|||<   _  |S )NdldtddrL   Tr   :)rm   r   r   rQ   r   rG   rV   )r   re   r   r   r   rX   rY   s          rI   r   z UniversalListingParser._parse_dl   s    ))$$T*Bbkk$/T1BCB$R[[D[%A%H%H%MN$R[[D[%AB1:CF	 D + 
rR   c                   i }| j                   j                  d      D ]  }|j                  D cg c]  }t        |t              s| }}t        |      dk(  s<t        |d   j                  dd      j                  d            }t        |d   j                  dd            }t        |      s|||<    |S c c}w )	Ndivr   r   rL   Tr   r   r   )
rm   r   children
isinstancer   r   rQ   r   rG   rV   )r   re   r   rX   kidsrY   s         rI   r   z"UniversalListingParser._parse_divs   s    99%%e,C"||B|!z!S/AA|DB4yA~$T!W%5%5c%5%F%M%Mc%RS$T!W%5%5c%5%FG1:CF - 
 Cs   B<B<c                l   dg d}}}d}|t        | j                        k  r| j                  |   }|dz   t        | j                        k  r| j                  |dz      nd}|j                         }t        j	                  |      r,|r| j
                  j                  ||d       dg }}d}|dz  }|j                  d	      rVt        |j                  d	            r<|r| j
                  j                  ||d       |j                  d	      g }}d}|dz  }|swd	|v rs|j                  d	      sbt        t        |j                  d	d            \  }}	t        |      r| j                  j                  ||	       |j                  |       |dz  }|sQt        |      rF|rDt        |      s9| j                  j                  ||       |j                  | d
|        |dz  }|j                  |       |dz  }|t        | j                        k  r|r| j
                  j                  ||d       yy)u   
        - identyfikujemy sekcje nagłówkami w stylu „XYZ:”
        - linie w sekcji „Opis …” **nie** są rozbijane na pary klucz:wartość
          (unikamy błędu z bullet-listą)
        r   u   OgólneFr   r@   sectionr   OpisTr   z: r   N)r   rq   r   DESCRIPTION_HDR_RErU   rr   rb   endswithrV   rG   maprQ   r   rt   
setdefault)
r   icurr   in_desclnnxtlowrX   rY   s
             rI   r}   z'UniversalListingParser._parse_free_text   s    2y3#dnn%%..#B+,q53t~~3F+F$..Q'BC((*C "'',%%,,-MN#Rs!Q {{3GBIIcN$;%%,,-MNYYs^Rs"Q sbyS1A+RXXc1-=>11:NN--a3

2Q BK))"c2

bTC5>*Q JJrNFA] #dnn%%` $$%EF rR   c                .   | j                   j                         D ci c])  \  }}t        |      rt        j                  d|      r||+ }}}|D ]!  }| j                   j                  ||   d       # | j                   j                  |       yc c}}w )u   
        Przesunięcie o komórkę – np. pierwsza kolumna zawiera wartości,
        a druga klucze.  Detekcja: w value jest „sensowny klucz”.
        \dN)rt   itemsrV   rM   searchpopr   )r   rX   rY   shifteds       rI   r~   z)UniversalListingParser._fix_shifted_pairs'  s     ,,.
.1qzbiiq1 qD. 	 

 ANNwqz40 g&
s   .Bc                T   t        | j                        D ]  }| j                  |   j                         |j                         c}|rB||k(  s=t	        |      dk  rt        j                  d|      rt        fdt        D              su| j                  j                  |        y )Nr   r   c              3  &   K   | ]  }|v  
 y wr_    )r   r   r   s     rI   r   z=UniversalListingParser._prune_garbage_keys.<locals>.<genexpr><  s     >+=RrSy+=   )
listrt   rE   r   r   rM   r   r   r   r   )r   rX   rY   r   s      @rI   r   z*UniversalListingParser._prune_garbage_keys5  s|    dnn%A^^A&,,.	FAs6FQJryy':>+=>>""1% &rR   c                    i }| j                   D ])  }|j                  |d   g       j                  |d          + |j                         D cg c]  \  }}|t	        |      d c}}| _         y c c}}w )Nr   r   r   )rr   r   extendr   rg   )r   r   stlss        rI   r   z&UniversalListingParser._merge_sections@  sq    ')""Aa	lB/66qzB # ?Elln
>LUQGBK0n
 
s   A0c           	        d}| j                   j                  t        j                  d            D ]5  }t        j                  t        |j                  dd                  s3|} n |rg }|j                  D ]{  }t        |t              r"t        j
                  d|j                        r nIt        |t              sF|j                  dv sU|j                  d |j                  d      D               } |r|| _        yg }| j                  D ]@  }|d	   j                         j!                  d
      s&|j                  d |d   D               B t        dj#                  |            xs d| _        y)u   
        1) Spróbujmy najpierw elegancko: po nagłówku <h?>„Opis …” zbieramy
           <ul>/<ol> i zwracamy list[str].
        2) Jeśli nie ma listy → fallback do dotychczasowego scalania w str.
        Nz^h[1-6]$rL   Tr   >   olulc              3  x   K   | ]2  }|j                  d       rt        |j                  dd              4 yw)Tr   rL   N)r   rQ   )r   lis     rI   r   z>UniversalListingParser._extract_description.<locals>.<genexpr>]  s8      !"4B;;T;2 )S)EF"4s   8:r   r   opisc              3  ^   K   | ]%  }|j                         j                  d       s| ' yw))u   rozwińzobaczN)r   
startswith)r   r   s     rI   r   z>UniversalListingParser._extract_description.<locals>.<genexpr>k  s,      )!779//0EF zs   +-r   )rm   r   rM   compiler   rU   rQ   r   next_siblingsr   r   namer   rv   rr   r   r   join)r   hdrhr   sibr   r   s          rI   r   z+UniversalListingParser._extract_descriptionJ  s>    ##BJJ{$;<A!''(8Ct9T(UV =
 !E((c3'BHH[#((,Kc3'CHH,DLL !"%,,t"4! 	 ) #(  ""A|!!#..v6  z  # ,CHHUO<DrR   c                n   dj                  | j                        j                         t        fdt        D              rd| _        n?t        fdt        D              rd| _        nt        fdt        D              rd| _        | j                  D ]  }t        j                  |j                               }|s)|j                  d      j                  d	d
      j                  dd
      }|j                  d
      }t        |d         dk(  r	|| _         y |d    d
|d    d
|d    | _         y  y )NrL   c              3  &   K   | ]  }|v  
 y wr_   r   r   wjoineds     rI   r   z>UniversalListingParser._detect_availability.<locals>.<genexpr>v  s     9$8qqF{$8r   unavailablec              3  &   K   | ]  }|v  
 y wr_   r   r   s     rI   r   z>UniversalListingParser._detect_availability.<locals>.<genexpr>x  s     8&7f&7r   reservedc              3  &   K   | ]  }|v  
 y wr_   r   r   s     rI   r   z>UniversalListingParser._detect_availability.<locals>.<genexpr>z  s     9&8f&8r   	availabler   rA   -.r      r   )r   rq   r   r   UNAVAILABLE_KEYWORDSrw   RESERVED_KEYWORDSAVAILABLE_KEYWORDSDATE_REr   rZ   rF   r   r   rx   )r   r   r\   rawpartsr   s        @rI   r   z+UniversalListingParser._detect_availabilitys  s
   $..)//19$899'DK8&788$DK9&899%DK ..Brxxz*Aggaj((c2::3D		#uQx=A-C #  7<AhZqq
!ERSH:3V #  !rR   c           	     ,   dj                  t        j                  d | j                  j	                         D        d            }dt        | j                         dt        | j                         dt        | j                  t              rd d	S d d	S )
Nz, c              3  0   K   | ]  \  }}| d |   yw)=Nr   )r   rX   rY   s      rI   r   z2UniversalListingParser.__repr__.<locals>.<genexpr>  s!     L5KTQ1QCj5Ks      z<UniversalListingParser z KV, z imgs | desc=r   rj   >)
r   	itertoolsislicert   r   r   rp   r   rv   r   )r   previews     rI   __repr__zUniversalListingParser.__repr__  s    ))LT^^5I5I5KLaP
 's4>>':&;54##$%]ZHXHXZ^=_62kkln	
ej2kkln	
rR   )returnro   )r  rs   )r  rj   )r   
__module____qualname____annotations__r   rm   r   rp   rq   rr   dictrt   rv   rw   rx   r   rz   r{   r|   r   r   r   r}   r~   r   r   r   r   r  r   rR   rI   ri   ri   l   s    
IU+D-+"tDKD eTBIyB38D4L0  !&5$ GI~G*/UD*IK'IE48FJ8!&E4!@NJ@
$	

B*X	:G|'	&
%ER.
rR   ri   c                4   t        |       }dj                  d |j                  D              }|j                  |j                  |j
                  d|j                  }|j                         D ci c]  \  }}|	|| }}}||j                  |dS c c}}w )u   
    Owijka pod użycie w Playwright-owym pipeline-ie.

    * description może być albo stringiem, albo listą łańcuchów
      – zależnie od tego, czy w HTML-u użyto <ul>/<li>.
    r   c              3  4   K   | ]  }|d    D ]  }|   yw)r   Nr   )r   secr   s      rI   r   z#raw_data_cleaner.<locals>.<genexpr>  s!      )sCLbL)s   )rv   rw   rx   )raw_textrp   
parse_data)	ri   r   rr   rv   rw   rx   rt   r   rp   )rk   parserflat_txtr  rX   rY   s         rI   raw_data_cleanerr    s     $D)Fyy )) H
 !,, -- // 

	J $.#3#3#5G#541a!Q$#5JG  ))!  Hs   1
B<B)rH   rj   r  rj   )rP   rj   r  rj   )rP   rj   r  rT   )r[   rj   r  zTuple[str, str] | None)rc   ro   r  ro   )rk   rj   r  r  )'__doc__
__future__r   r	  rB   loggingrM   dataclassesr   r   typingr   r   r   r	   bs4r
   r   basicConfigINFO	getLoggerr   r   r   r   r   Ir   r   r   r   r  rJ   rQ   rV   r]   rg   ri   r  r   rR   rI   <module>r$     s     #    	 ( . . "   ',, '			8	$ 
   RZZrtt4  ? L '6 
"**EDDL/hY d
 d
 d
X	rR   