
    ,/i^                     8   d dl Z d dlZd dlmZ d dlmZmZ d Zd Zd Z	d Z
d dlmZmZ d dlZd	Zd dlZd dlmZmZ h d
ZdedefdZdedefdZd dlmZ d dlmZ dedefdZdefdZd%defdZd ZdedefdZdedefdZd Zd Zd  Zd! Z d"ed#edefd$Z!y)&    NBeautifulSoup)NetworkMonitoredPage	AdsManualc                 @    | j                  dd      }|dk(  ry|dk(  ryy )N	ifMissingnulltrueTfalseFget)configopts     3/var/www/extractly/manual_agregator/parser/utils.pyresolve_missing_boolr      s'    
**[&
)C
f}T
g~e    c                 z    | j                  d      dk(  r| j                  dd      S | j                  d      dk(  ry y )Nr   defaultdefaultValue r	   r   )r   s    r   resolve_missing_textr      s;    zz+)+zz."--zz+&(r   c                     | sy t        |       j                         j                         j                  dd      }ddddddddddddddd}|j	                         D ]  \  }}||v s|c S  y )N r   PLNEURUSDGBP)   złplnzlu   zł.eureuro   €usddolar$zus$gbpfunt   £)strstriplowerreplaceitems)label_or_texttextCURRENCY_MAPkvs        r   parse_currencyr4      s    }##%++-55c2>D%U%5uEe%5e	L ""$19H % r   c                    | syt        |       j                  dd      j                  dd      }t        j                  d|      }d x}x}}|ru|j	                  d      j                  dd      }	 d|v rt        |      n
t        |      }||j                         d  j                         }t        |      xs t        |      }|||fS # t        $ r d }Y Kw xY w)	N)NNN    r   ,.z([\d\s]+(?:\.\d+)?)   r   )r*   r-   researchgroupfloatint	Exceptionendr+   r4   )r0   tmnumberlabelcurrency
number_strs          r   extract_number_and_labelrG   &   s    D	&#&..sC8A
		(!,A $$F$UXWWQZ''R0
	*-*;U:&ZF !%%'(!!#!%(=N1,=5(""	  	F	s   /C CC)r   Tag)r   r   r#   r!   r&   r$   r)   r'   >   r&   r!   r   r   r#   sreturnc                     | sy| j                  dd      } t        j                  dd|       } | j                         } | j	                  d      r| d d j                         } | j                         S )Nr   r6   r   z\s+:)r-   r:   subr+   endswithr,   )rI   s    r   _norm_labelrP   C   s]    			&#A
vsAA		Azz#crFLLN779r   soupr   c           	      
  '()* | r|r|j                  d      sy|j                  d      }|j                  dg       xs g }|j                  dd      '|j                  dd      (t        |j                  d      xs d	      }|j                  d
g       D ch c]  }|s|	 c})|j                  dd      }|j                  d      }|j                  dd      }dt        dt        f'fd}	t        |      h|D 
ch c]  }
t        |
       c}
z  *dt        dt        f'(*fd}|r| j                  |      n| g}|D ]y  }|j                  d      }|D ]a  }|j                  dd      }t        |      dk  r%|d   j                  dd      } ||      sD|d   j                  dd      }|s]|c c S  { dt        dt        fd}g }|D ]H  }|j                  d      D ]2  }|j                  dd      }|s ||      s"|j                  |       4 J |sydt        dt        f)fd}|D ]  }|dv r|j                         }t        |t              r|j                  |v r ||      r|j                  dd      }|r|c S 	 |j                  d       }|r^|j                  d!      xs d"j                         r;|j                  d!      j                         }d#|v sd$|v sd%|v r y&d'|v sd(|v sd)|v r y*|j                         }t        |t              r|r|}t#        d+      D ]  }t        |t              r|j$                  nd}t        |t              s ng }|D ]  }|j                  |d      D ]u  }  ||       s| |u r| j                  dd      }|s' ||      rdnd}!t'        | d,d      xs d}"t'        |d,d      xs d}#|"|#k\  r	|"s|#r|!dz  }!|j                  |!|f       w  |s|j)                  d- d.       |d   d   c c S  |d/k(  rG|j+                         }$t        |$t              r'|$j                  |v r|$j                  dd      }|r|c S 	 |j                         }%d}&t        |%t              r|&d+k  r|%j                  d       }|r^|j                  d!      xs d"j                         r;|j                  d!      j                         }d#|v sd$|v sd%|v r y&d'|v sd(|v sd)|v r y*|%j                         }%|&dz  }&t        |%t              r|&d+k  r yc c}w c c}
w # t         $ r Y Ew xY w# t         $ r Y w xY w)0u  
    Kompatybilny ekstraktor etykieta→wartość.
    Obsługuje:
      - OtoDom: <div data-sentry-element="ItemGridContainer"><p>ETYKIETA</p><p>WARTOŚĆ</p></div>
      - fallbacki: 'sibling'/'parent'/'next_tag' + valueTags
    Klucze config (tak jak dotychczas, wszystkie opcjonalne poza 'label'):
      - label: str
      - selector: str (zawęź obszar szukania)
      - altLabels: list[str]
      - caseInsensitive: bool (domyślnie True)
      - match: "equals" | "contains" (domyślnie "equals")
      - valueTags: list[str] (domyślnie ["p","span","div"])
      - labelPosition: "sibling" | "parent" | "next_tag" (domyślnie "sibling")
    rD   N	altLabelscaseInsensitiveTmatchequals	valueTags)pspandivvalueClasseslabelPositionsiblingselectorparentFallbackrI   rJ   c                 T    | xs dj                         } r| j                         S | S )Nr   )r+   r,   )rI   case_insensitives    r   normz$extract_value_by_label.<locals>.normi   s&    W"OO,qwwy3!3r   r0   c                     rt        |       n| j                         j                  d      dk(  rt        fdD              S v S )NrL   containsc              3   &   K   | ]  }|v  
 y wN ).0wrA   s     r   	<genexpr>z;extract_value_by_label.<locals>.is_match.<locals>.<genexpr>q   s     .v!qAvv   )rP   r+   rstripany)r0   rA   ra   
match_typewanteds    @r   is_matchz(extract_value_by_label.<locals>.is_matchn   sE    !1Ktzz|7J7J37O#.v...F{r   z=div[data-sentry-element="ItemGridContainer"], div.css-1xw0jqprX   )	recursive   r   r   r+   r9   c                     | xs dj                         t        t        j                  d            xs t	        fdt
        D              S )Nr   z\dc              3   B   K   | ]  }|j                         v   y wrf   r,   )rh   hrA   s     r   rj   z>extract_value_by_label.<locals>._good_value.<locals>.<genexpr>   s     /X1QWWY   )r+   boolr:   r;   rm   _CURRENCY_HINTS)r0   rA   s    @r   _good_valuez+extract_value_by_label.<locals>._good_value   s;    ZR BIIeQ'(XC/X/X,XXr   tagc                     sy| j                  d      syt        t              rj                         t	        fdD              S )NTclassFc              3   &   K   | ]  }|v  
 y wrf   rg   )rh   clsclassess     r   rj   zGextract_value_by_label.<locals>._matches_value_class.<locals>.<genexpr>   s     ;]c3'>]rk   )r   
isinstancer*   splitrm   )r|   r   value_classess    @r   _matches_value_classz4extract_value_by_label.<locals>._matches_value_class   sC    '''"gs#mmoG;];;;r   )r]   parentimgsrcr   	ikona_takicon_yesyestak	ikona_nieicon_nononie   
sourcelinec                     | d   S )Nr   rg   )xs    r   <lambda>z(extract_value_by_label.<locals>.<lambda>   s    !A$r   )keyreversenext_tag)r   tupler*   rP   ry   selectfind_alllenget_textappendrH   find_next_siblingr   namefindr,   r?   ranger   getattrsort	find_next)+rQ   r   rD   
alt_labels
value_tagscstrategyroot_selectorallow_parent_fallbackrb   r   rp   rootsrootrowsrowpslabval_textr{   label_nodeseltxtr   sibvalr   r   up_
candidatestag_namecandscorec_linee_linenxt
value_cellhopsra   rn   r   ro   s+                                          @@@@r   extract_value_by_labelr   M   sg    vVZZ%8zz'*Ezz+r28bJzz"3T:zz'84JVZZ4L8LMJ#)::nb#AG#AaQ#AGMzz/9=Hzz*-M"JJ'7>4 4 4 % !Z$HZ[^Z$HHFs t  +8DKK&dVE {{Z[CcT2B2w{Q%..D.1CC=!u~~c~6H  Y# Y$ Y
 K--%B++c+.Cx}""2& &  <# <$ < ,,&&(CS#&88z).B3.G,,s$,7C"
	!hhuoCGGEN$8b#?#?#A"%''%."6"6"8C*c1Z35F%SV,',*c1Y#5EQT', ++-! S#&& !B1X",R"5RYY4!"c*
 *H "H E3D9$2:$"mmCtm<"$&1#&6A!(|Q!?!D1!(\1!=!B!V+6!QJE"))5#,7 !F !+  OOOE%a=++/ 4 z!,,.C#s#J(>ll3dl3J	--/JDZ-$( ooe,CGGEN0b779''%...0C"c)Z3->%3,$"c)Y#-=$'99;
	 Z-$( X M H %IF % n  		sJ   T* T*=T/
A!T4-T4#BU-U;*U4	U U	UU)OrderedDict)urljoinuc                 X    | sy| j                         } | j                  d      rd| z   S | S )Nr   z//zhttps:)r+   
startswith)r   s    r   _normalize_urlr      s.    		A||D!|Hr   srcsetc                     g }| s|S | j                  d      D ]F  }|j                         j                  d      d   j                         }|s6|j                  |       H |S )uQ   
    Parsuje atrybut srcset i zwraca listę URL-i (bez descriptorów w/h/x).
    r7   r   r   )r   r+   r   )r   urlspartr   s       r   _split_srcsetr     s^     DS!zz|!!#&q)//1KK " Kr   base_urlc                    t               |r| j                  |      n| g}dt        ffd}|D ]  }|j                  d      D ]  } ||j	                  d              ||j	                  d              ||j	                  d              ||j	                  d             |j	                  d      xs |j	                  d	      }|st        |      D ]
  } ||         |j                  d
      D ]A  }	|	j	                  d      xs |	j	                  d	      }|s*t        |      D ]
  } ||        C |j                  d      D ]  }
ddlm}  ||
j                         d      }|j                  d      D ]o  } ||j	                  d              ||j	                  d             |j	                  d      xs |j	                  d	      }|sXt        |      D ]
  } ||        q   | j                  dddi      }|r|j	                  d      r ||d          | j                  dddi      }|r|j	                  d      r ||d          t        j                               S )u   
    Zwraca listę absolutnych URL-i obrazów, bez duplikatów, z zachowaniem kolejności.
    Wspiera: <img src>, lazy atrybuty, <source srcset>, meta og:image/twitter:image.
    Jeśli podasz `selector`, przeszukuje tylko wybrany fragment DOM-u.
    urlc                     t        |       } | sy | j                  d      sC| j                  d      s2r/| j                  d      s| j                  d      st        |       } ny | vrd| <   y y )Nzhttp://zhttps:///zdata:T)r   r   r   )r   r   seens    r   addz extract_image_links.<locals>.add  sg    S!y)S^^J-GS^^C0w8Oh,d?DI r   r   r   zdata-srczdata-originalz	data-lazyr   zdata-srcsetsourcenoscriptr   r   zhtml.parsermetapropertyzog:image)attrscontentr   ztwitter:image)r   r   r*   r   r   r   bs4r   decode_contentsr   listkeys)rQ   r^   r   scopesr   scoper   r   r   r   nsr   innerogtwr   s     `            @r   extract_image_linksr     s    =D&.T[["TFF  >>%(C
#$()$%WWX&@#''-*@F&v.AF / ) >>(+CWWX&@#''-*@F&v.AF / , ..,B *!""4"4"6FE~~e,CGGEN#CGGJ'(*Dcggm.D*62A 3 - -/ L 
6*j!9	:B	bffYByM	6&/!:	;B	bffYByM		r   c                     | yt        | t              r| j                         dk(  ryt        | t        t        f      r| syy)NTr   F)r   r*   r+   r   dict)r3   s    r   value_is_emptyr   X  s6    y!Saggi2od!dD\"1Tr   inactive_rulesc                 6   |xs g D ]  }|j                  d      }|dk(  rI|j                  d      xs dj                         }|s@|| j                  xs dj                         v sa y|dk(  rr|j                  d      }|j                  d      xs dj                         }|r|j                  |      nd }|s|j	                  d      j                         |k(  s y|d	k(  rt|j                  d      }|j                  d      xs dj                         }|r|j                  |      nd }|s,||j	                  d
d      j                         v sQ y|dk(  r*|j                  d      }|sm|j                  |      r y|dk(  s|j                  d      }	|j                  d      xs dj                         }
t        | |	d      }t        |t              rt        j                  |      }|s|
t        |      j                         v s y y)Ntypetext_containsr0   r   Tselector_textr^   rs   selector_containsr   selector_missingsource_field_matchfieldrU   F)r   r,   html
select_oner   r   r   r   jsondumpsr*   )pagerQ   r   rulerA   r   selexpectedr   r   rU   values               r   check_inactiver   ^  s   $"$HHV88F#)r002CstyyB5577/!((:&C(.B557H),%$Bbkkk-335A%%((:&C88F#)r002C),%$BcR[[D[9??AA$$((:&C4??3/&&HHW%EXXg&,"335ED%,E%&

5)#e*"2"2"44; %< r   dc                 b    	 t        d | j                         D              S # t        $ r Y yw xY w)Nc              3   `   K   | ]&  }t        |t              xr d |v xs
 d|v xs d|v  ( yw)	fieldType	selectorsfromMainN)r   r   )rh   r3   s     r   rj   z'is_field_config_dict.<locals>.<genexpr>  s@      
 q$][A%5%\9I%\Z[\_]s   ,.F)rm   valuesr?   )r   s    r   is_field_config_dictr    s;     
XXZ
 
 	
  s   " 	..c                     | sg S 	 t        | t              rt        j                  |       } t        | t
              r| gS t        | t        t        f      rt        |       S g S # t        $ r g cY S w xY w)u   
    Zwraca listę reguł (list[dict]) albo pustą listę.
    Akceptuje: None, str(JSON), dict, list/tuple.
    Inne typy -> [].
    )r   r*   r   loadsr?   r   r   r   )ruless    r   normalize_rulesr    so     	eS!JJu%E %w%$'E{I  	s   %A! !A/.A/c                     | si S t        | t              r*	 t        j                  |       }t        |t              r|S i S t        | t              r| S i S # t
        $ r i cY S w xY w)u   
    Zwraca dict z selektorami (gałęzie lub płaska mapa pól).
    Akceptuje: None -> {}, str(JSON) -> dict lub {},
               dict -> dict, inne -> {}.
    )r   r*   r   r  r   r?   )r  r   s     r   normalize_selectorsr
    si     	)S!	**Y'C$S$/37R7 #9d39;;  	I	s   &A A A! A!c                    d}t        t        | dd            }t        t        | dd            }|D ]  }t        |t              sd|v rd|v r|j                  d      }|r|j                  |      nd}|sE|j                  d      j                         |j                  dg       }t        |t              r|g}t        fd	|D              s|j                  d      } nd
|v sd|v s |r||v rt        ||   t              r||   |fS d|v rt        |d   t              r|d   dfS t        |      r|dfS t        |      dk(  r9t        t        |j                                     \  }	}
t        |
t              r|
|	fS i dfS )u   
    Zwraca ZAWSZE krotkę: (selectors_dict, selected_type|None).
    Obsługuje:
      - reguły starego typu: {'selector','match','type'} / {'source','match','type'}
      - 'default' gałąź
      - płaską mapę pól
      - jedyną gałąź
    Nr  r  r^   r   Trs   rU   c              3   B   K   | ]  }|j                         v   y wrf   rv   )rh   r3   r   s     r   rj   z$resolve_selectors.<locals>.<genexpr>  s     8RqwwyG+Rrx   r   r   flatr9   )r  r   r
  r   r   r   r   r   r,   r*   rm   r  r   nextiterr.   )manual_configrQ   selected_typer  all_selr   r   r   mvr2   r3   r   s              @r   resolve_selectorsr    s    MGM7DABE!'-d"KLG $%&D.((:&C),%$B++D+1779XXgr*b#&bT8R88$(HHV$4Me&D.   '1jAWY]6^}%}44G
79+=t Dy!9,,G$
7|qD)*1aa4K t8Or   c                   	 dt         dt        ffd	t        |       }|D ]  }t        |t               s|j	                  d      }t        |t               s6d|v r$t        	fd|j	                  dg       D              nd}|j	                  dg       }|rt        	fd	|D              nd}|s|s|j	                  d
      c S  y)u   
    Nowy styl:
      { "when": { "all": [...], "any": [...] }, "type": "otodom_v2" }
    Zwraca nazwę gałęzi (type) lub None.
    condrJ   c                 R   d| v rt        j                  | d               S d| v rj                  | d         d u S d| v r/| j                  d      }t        |xr j                  |            S d| v r1| j                  d      }t        |      xr j                  |      d u S d| v r| d   j                  d      }| d   j                  d      xs dj	                         }|rj                  |      nd }t        |xr# ||j                  d	d
      j	                         v       S y)Nfield_emptyfield_missingselector_existsr   r   r^   r0   r   r   Trs   F)r   r   ry   r   r,   r   )r  r   r   r   	extractedrQ   s       r   cond_okz$apply_dynamic_rules.<locals>.cond_ok  s+   D !)--]0C"DEEd"==o!674??$((,-C4 455%((-.C9?$//#"6$">?$&*+//
;C+,008>BEEGC),%$BJsbkk#Tk&B&H&H&JJKKr   whenallc              3   .   K   | ]  } |        y wrf   rg   rh   r   r  s     r   rj   z&apply_dynamic_rules.<locals>.<genexpr>  s     =)<AWQZ)<   Trm   c              3   .   K   | ]  } |        y wrf   rg   r   s     r   rj   z&apply_dynamic_rules.<locals>.<genexpr>  s     2AWQZr!  r   N)r   ry   r  r   r   r  rm   )
r  rQ   r  
rules_listr   r  all_okany_listany_okr  s
    ``      @r   apply_dynamic_rulesr'    s    d t $ !'J$%xx$%AF$=%)<==TX88E2&6>222Df88F##  r   datar   c                 |   t         j                  j                  D ch c]  }|j                  dvr|j                   }}| xs i j	                         D ci c]  \  }}||v s|| }}}d|v r#| xs i j                  d|j                        |d<   d|v r>| xs i j                  d      }|"t        |di       xs i }|j                  d      }||d<   dD ])  }	|	|v s|j                  |	      t        ||	d       ||	<   + d|v rd|vr|j                  |d<   d|v rd| xs i v r| d   |d<   	 |j                  dd       }
|
d u xs# t        |
t              xr |
j                          }|r|j                  d	| xs i j                  d	            }|j                  d
| xs i j                  d
            }d } ||      } ||      }|(|&|dkD  r!t        t        ||z              }d|v r||d<   nd|v r	d|vrd |d<   	 d|v rF|j                  d| xs i j                  d            }dt        dt        fd} ||      }|d}||d<   	 ddlmm fd}dD ]!  }||v s ||j                  |            ||<   # 	 |S c c}w c c}}w # t        $ r Y w xY w# t        $ r Y ]w xY w# t        $ r Y |S w xY w)N)id
created_at	is_activeinactive_reasonr   )	image_linksr   r   inactive_datedate_fetchedr+  r   estate_type
offer_typer   original_image_urlsprice_per_m2pricesquare_footagec                     | y t        | t        t        f      rt        |       S 	 t        |       j	                  dd      j	                  dd      j	                  dd      }t        |      S # t
        $ r Y y w xY w)Nr6   r   r   r7   r8   )r   r>   r=   r*   r-   r?   )r3   rI   s     r   	_to_floatz+map_data_to_manual_model.<locals>._to_floatA  sp    9a#u. 8O Avs3;;CDLLSRUVA 8O    s   AA, ,	A87A8r   	area_unitr   rJ   c                 ~   | y t        |       j                         j                         }|j                  dd      j                  dd      j                  dd      j                  dd      j                  dd      }dd l} |j
                  d|      }|r|j                  d	      S d
|v r	d|v sd|v ryd|v ryd|v ryy )Nu   m²m2zm^2zm kwmkwzm.2r   z\b(m2|ha|ar|ft2|sqft)\br9   rB   2   ²hektarhaar)r*   r+   r,   r-   r:   r;   r<   )r   rI   r:   rB   s       r   _normalize_unitz1map_data_to_manual_model.<locals>._normalize_unit^  s    ;HNN$**,IIeT*225$?GGPTU]]^ceijrrsxz~BII8!<771:%!8TQYq=19r   r;  )datetimedatec                    | y t        |       rt        |       s| S t        |       j                         }|sy |j                  dd      j                  dd      j                  dd      j	                         }h d}||v ry g d}|D ]$  }	 j                  ||      j                         c S  y # t        $ r Y 3w xY w)Nu   „"u   ”u   ’'>      —n/an/d
nie podanonie dotyczybrak informacji-brak)z%Y-%m-%dz%d.%m.%Yz%d/%m/%Yz%d-%m-%Y)r   r*   r+   r-   r,   strptimerD  r?   )r   rI   s_normplaceholdersfmtsfmtrD  rC  s         r   _clean_date_valz1map_data_to_manual_model.<locals>._clean_date_val  s    {#t$ZX-F
C AYYx-55hDLLXWZ[aacFmL%D #,,Q499;;   ! s   B22	B>=B>)available_fromlisting_date)r   _metafieldsr   r.   r   r,  r   r   r   r*   r+   r>   roundr?   rm   rC  rD  )r(  r   fallowedr2   r3   mappedreasonr   r   pp2is_missing_pp2	price_valarea_valr8  rX   acomputedraw_urB  rb   rU  dfrD  rC  s                          @@r   map_data_to_manual_modelrg  
  s/    '''A66-- 	
'   !%
113D3tq!qG|ad3FD g#zr..{DNNK{G#*"!!"34>4,2DXX/0F$* ! '>vzz#6!$T2F3K E/u ',Adjb,Q(,-B(C$%jj.+T:c3+?+S		O

7TZR,<,<W,EFIzz"2TZR4D4DEU4VWH	  )$A(#A}1q5uQU|,!W,-5F>*!W,.2N-1F>*'!JJ{TZR,<,<[,IJES S & #5)D|"&F;&+	< 5BV|,VZZ^<r
 5 Ms EJ  @  T  M	sO   !JJ
&J
1CJ A
J J. &J. 	JJ	J+*J+.	J;:J;)NN)"r   r:   r   r   extractly.modelsr   r   r   r   r4   rG   rH   rz   r*   rP   r   r   collectionsr   urllib.parser   r   r   r   r   r   ry   r   r  r  r
  r  r'  rg  rg   r   r   <module>rk     s      <#& # 	G 	 "33 3 ] ] ]P $  c c # Ds DTt  BD T &<$-b$TZ4 Z/C Z Zr   